{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 2406, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012476606363069245, "grad_norm": 8.761757813219194, "learning_rate": 0.0, "loss": 2.2506, "num_tokens": 76455.0, "step": 1 }, { "epoch": 0.002495321272613849, "grad_norm": 8.78949353381417, "learning_rate": 1.36986301369863e-07, "loss": 2.2588, "num_tokens": 152591.0, "step": 2 }, { "epoch": 0.0037429819089207735, "grad_norm": 8.761353805328225, "learning_rate": 2.73972602739726e-07, "loss": 2.2518, "num_tokens": 228968.0, "step": 3 }, { "epoch": 0.004990642545227698, "grad_norm": 8.680409609244958, "learning_rate": 4.1095890410958903e-07, "loss": 2.2367, "num_tokens": 306355.0, "step": 4 }, { "epoch": 0.006238303181534623, "grad_norm": 8.8251922200434, "learning_rate": 5.47945205479452e-07, "loss": 2.2511, "num_tokens": 382390.0, "step": 5 }, { "epoch": 0.007485963817841547, "grad_norm": 8.678437336777996, "learning_rate": 6.849315068493151e-07, "loss": 2.2412, "num_tokens": 459014.0, "step": 6 }, { "epoch": 0.008733624454148471, "grad_norm": 8.63321084471758, "learning_rate": 8.219178082191781e-07, "loss": 2.2448, "num_tokens": 534258.0, "step": 7 }, { "epoch": 0.009981285090455396, "grad_norm": 8.511625206608334, "learning_rate": 9.589041095890411e-07, "loss": 2.2307, "num_tokens": 610291.0, "step": 8 }, { "epoch": 0.011228945726762321, "grad_norm": 8.300645950958717, "learning_rate": 1.095890410958904e-06, "loss": 2.1933, "num_tokens": 685797.0, "step": 9 }, { "epoch": 0.012476606363069246, "grad_norm": 7.8420598537636526, "learning_rate": 1.2328767123287673e-06, "loss": 2.1148, "num_tokens": 762054.0, "step": 10 }, { "epoch": 0.01372426699937617, "grad_norm": 7.620747695514949, "learning_rate": 1.3698630136986302e-06, "loss": 2.0744, "num_tokens": 839526.0, "step": 11 }, { "epoch": 0.014971927635683094, "grad_norm": 7.7311728817507674, "learning_rate": 1.5068493150684932e-06, "loss": 2.0946, "num_tokens": 915416.0, "step": 12 }, { "epoch": 0.016219588271990017, "grad_norm": 6.496044071412181, "learning_rate": 1.6438356164383561e-06, "loss": 1.8218, "num_tokens": 992432.0, "step": 13 }, { "epoch": 0.017467248908296942, "grad_norm": 6.466236172125798, "learning_rate": 1.7808219178082193e-06, "loss": 1.8035, "num_tokens": 1069191.0, "step": 14 }, { "epoch": 0.018714909544603867, "grad_norm": 6.272721593409537, "learning_rate": 1.9178082191780823e-06, "loss": 1.7429, "num_tokens": 1146441.0, "step": 15 }, { "epoch": 0.019962570180910792, "grad_norm": 6.294095568991284, "learning_rate": 2.0547945205479454e-06, "loss": 1.7267, "num_tokens": 1222771.0, "step": 16 }, { "epoch": 0.021210230817217717, "grad_norm": 6.86423665778992, "learning_rate": 2.191780821917808e-06, "loss": 1.1876, "num_tokens": 1298674.0, "step": 17 }, { "epoch": 0.022457891453524642, "grad_norm": 6.675922637460064, "learning_rate": 2.3287671232876713e-06, "loss": 1.1796, "num_tokens": 1374228.0, "step": 18 }, { "epoch": 0.023705552089831567, "grad_norm": 6.923846448363932, "learning_rate": 2.4657534246575345e-06, "loss": 1.1247, "num_tokens": 1451793.0, "step": 19 }, { "epoch": 0.024953212726138492, "grad_norm": 9.498586320399488, "learning_rate": 2.6027397260273973e-06, "loss": 0.9696, "num_tokens": 1528307.0, "step": 20 }, { "epoch": 0.026200873362445413, "grad_norm": 6.642114493263439, "learning_rate": 2.7397260273972604e-06, "loss": 0.8689, "num_tokens": 1604991.0, "step": 21 }, { "epoch": 0.02744853399875234, "grad_norm": 10.71018918516391, "learning_rate": 2.876712328767123e-06, "loss": 0.7623, "num_tokens": 1681870.0, "step": 22 }, { "epoch": 0.028696194635059263, "grad_norm": 5.930114665023217, "learning_rate": 3.0136986301369864e-06, "loss": 0.3494, "num_tokens": 1758838.0, "step": 23 }, { "epoch": 0.02994385527136619, "grad_norm": 2.883944763296146, "learning_rate": 3.1506849315068495e-06, "loss": 0.2483, "num_tokens": 1834793.0, "step": 24 }, { "epoch": 0.031191515907673113, "grad_norm": 2.6374171238506747, "learning_rate": 3.2876712328767123e-06, "loss": 0.2201, "num_tokens": 1911654.0, "step": 25 }, { "epoch": 0.032439176543980035, "grad_norm": 0.8688472783801062, "learning_rate": 3.4246575342465754e-06, "loss": 0.1953, "num_tokens": 1987054.0, "step": 26 }, { "epoch": 0.03368683718028696, "grad_norm": 0.6726353699266373, "learning_rate": 3.5616438356164386e-06, "loss": 0.1991, "num_tokens": 2063735.0, "step": 27 }, { "epoch": 0.034934497816593885, "grad_norm": 0.6879798139762083, "learning_rate": 3.6986301369863014e-06, "loss": 0.1865, "num_tokens": 2140736.0, "step": 28 }, { "epoch": 0.03618215845290081, "grad_norm": 0.5328446424800377, "learning_rate": 3.8356164383561645e-06, "loss": 0.1794, "num_tokens": 2215808.0, "step": 29 }, { "epoch": 0.037429819089207735, "grad_norm": 0.49572408846234306, "learning_rate": 3.972602739726027e-06, "loss": 0.1722, "num_tokens": 2291680.0, "step": 30 }, { "epoch": 0.03867747972551466, "grad_norm": 0.5123370045083006, "learning_rate": 4.109589041095891e-06, "loss": 0.1731, "num_tokens": 2367158.0, "step": 31 }, { "epoch": 0.039925140361821584, "grad_norm": 0.5664689020425665, "learning_rate": 4.246575342465754e-06, "loss": 0.1663, "num_tokens": 2442906.0, "step": 32 }, { "epoch": 0.041172800998128506, "grad_norm": 0.5299695266663934, "learning_rate": 4.383561643835616e-06, "loss": 0.1661, "num_tokens": 2518923.0, "step": 33 }, { "epoch": 0.042420461634435434, "grad_norm": 0.4311694293651315, "learning_rate": 4.52054794520548e-06, "loss": 0.1576, "num_tokens": 2594906.0, "step": 34 }, { "epoch": 0.043668122270742356, "grad_norm": 0.37623197449723156, "learning_rate": 4.657534246575343e-06, "loss": 0.1464, "num_tokens": 2670975.0, "step": 35 }, { "epoch": 0.044915782907049284, "grad_norm": 0.3043781819858105, "learning_rate": 4.7945205479452054e-06, "loss": 0.1484, "num_tokens": 2746676.0, "step": 36 }, { "epoch": 0.046163443543356206, "grad_norm": 0.24996202751880106, "learning_rate": 4.931506849315069e-06, "loss": 0.1467, "num_tokens": 2822529.0, "step": 37 }, { "epoch": 0.047411104179663134, "grad_norm": 0.2502460222418088, "learning_rate": 5.068493150684932e-06, "loss": 0.1531, "num_tokens": 2899091.0, "step": 38 }, { "epoch": 0.048658764815970056, "grad_norm": 0.21114797440871622, "learning_rate": 5.2054794520547945e-06, "loss": 0.1421, "num_tokens": 2975561.0, "step": 39 }, { "epoch": 0.049906425452276984, "grad_norm": 0.21983583172698634, "learning_rate": 5.342465753424658e-06, "loss": 0.1353, "num_tokens": 3051729.0, "step": 40 }, { "epoch": 0.051154086088583905, "grad_norm": 0.20644663877144454, "learning_rate": 5.479452054794521e-06, "loss": 0.1373, "num_tokens": 3127617.0, "step": 41 }, { "epoch": 0.05240174672489083, "grad_norm": 0.24835560615668648, "learning_rate": 5.6164383561643845e-06, "loss": 0.141, "num_tokens": 3204875.0, "step": 42 }, { "epoch": 0.053649407361197755, "grad_norm": 0.21591056284990343, "learning_rate": 5.753424657534246e-06, "loss": 0.1341, "num_tokens": 3281650.0, "step": 43 }, { "epoch": 0.05489706799750468, "grad_norm": 0.23616198046047548, "learning_rate": 5.89041095890411e-06, "loss": 0.1181, "num_tokens": 3356216.0, "step": 44 }, { "epoch": 0.056144728633811605, "grad_norm": 0.22200353016141697, "learning_rate": 6.027397260273973e-06, "loss": 0.139, "num_tokens": 3433001.0, "step": 45 }, { "epoch": 0.05739238927011853, "grad_norm": 0.25871574428667854, "learning_rate": 6.164383561643836e-06, "loss": 0.1275, "num_tokens": 3508082.0, "step": 46 }, { "epoch": 0.058640049906425455, "grad_norm": 0.21842107713575873, "learning_rate": 6.301369863013699e-06, "loss": 0.1265, "num_tokens": 3583716.0, "step": 47 }, { "epoch": 0.05988771054273238, "grad_norm": 0.24004259697156158, "learning_rate": 6.438356164383563e-06, "loss": 0.1242, "num_tokens": 3659753.0, "step": 48 }, { "epoch": 0.0611353711790393, "grad_norm": 0.2116092456996917, "learning_rate": 6.5753424657534245e-06, "loss": 0.1328, "num_tokens": 3736035.0, "step": 49 }, { "epoch": 0.06238303181534623, "grad_norm": 0.2013190481716377, "learning_rate": 6.712328767123288e-06, "loss": 0.1179, "num_tokens": 3811794.0, "step": 50 }, { "epoch": 0.06363069245165315, "grad_norm": 0.22264594627704687, "learning_rate": 6.849315068493151e-06, "loss": 0.1243, "num_tokens": 3888185.0, "step": 51 }, { "epoch": 0.06487835308796007, "grad_norm": 0.18870977013081056, "learning_rate": 6.9863013698630145e-06, "loss": 0.1216, "num_tokens": 3964653.0, "step": 52 }, { "epoch": 0.066126013724267, "grad_norm": 0.21848947971984553, "learning_rate": 7.123287671232877e-06, "loss": 0.1187, "num_tokens": 4040648.0, "step": 53 }, { "epoch": 0.06737367436057393, "grad_norm": 0.2103776859815552, "learning_rate": 7.260273972602741e-06, "loss": 0.1131, "num_tokens": 4117805.0, "step": 54 }, { "epoch": 0.06862133499688085, "grad_norm": 0.2005737766635463, "learning_rate": 7.397260273972603e-06, "loss": 0.1215, "num_tokens": 4194859.0, "step": 55 }, { "epoch": 0.06986899563318777, "grad_norm": 0.1956940878293591, "learning_rate": 7.534246575342466e-06, "loss": 0.111, "num_tokens": 4270378.0, "step": 56 }, { "epoch": 0.07111665626949469, "grad_norm": 0.19607223925616332, "learning_rate": 7.671232876712329e-06, "loss": 0.119, "num_tokens": 4346106.0, "step": 57 }, { "epoch": 0.07236431690580163, "grad_norm": 0.19170667845038655, "learning_rate": 7.808219178082192e-06, "loss": 0.1116, "num_tokens": 4421749.0, "step": 58 }, { "epoch": 0.07361197754210855, "grad_norm": 0.20616149253606764, "learning_rate": 7.945205479452055e-06, "loss": 0.1178, "num_tokens": 4499385.0, "step": 59 }, { "epoch": 0.07485963817841547, "grad_norm": 0.19654472797463204, "learning_rate": 8.082191780821919e-06, "loss": 0.1082, "num_tokens": 4575336.0, "step": 60 }, { "epoch": 0.07610729881472239, "grad_norm": 0.2112216421698123, "learning_rate": 8.219178082191782e-06, "loss": 0.1086, "num_tokens": 4650506.0, "step": 61 }, { "epoch": 0.07735495945102933, "grad_norm": 0.2076293442211523, "learning_rate": 8.356164383561644e-06, "loss": 0.1151, "num_tokens": 4726709.0, "step": 62 }, { "epoch": 0.07860262008733625, "grad_norm": 0.19682970614365813, "learning_rate": 8.493150684931507e-06, "loss": 0.1138, "num_tokens": 4802335.0, "step": 63 }, { "epoch": 0.07985028072364317, "grad_norm": 0.22630532906454626, "learning_rate": 8.63013698630137e-06, "loss": 0.1134, "num_tokens": 4879330.0, "step": 64 }, { "epoch": 0.08109794135995009, "grad_norm": 0.20005557766025198, "learning_rate": 8.767123287671233e-06, "loss": 0.1109, "num_tokens": 4956574.0, "step": 65 }, { "epoch": 0.08234560199625701, "grad_norm": 0.18919889491059635, "learning_rate": 8.904109589041097e-06, "loss": 0.1022, "num_tokens": 5032651.0, "step": 66 }, { "epoch": 0.08359326263256395, "grad_norm": 0.2220403365811989, "learning_rate": 9.04109589041096e-06, "loss": 0.1165, "num_tokens": 5110439.0, "step": 67 }, { "epoch": 0.08484092326887087, "grad_norm": 0.22886272632277932, "learning_rate": 9.178082191780823e-06, "loss": 0.0964, "num_tokens": 5186001.0, "step": 68 }, { "epoch": 0.08608858390517779, "grad_norm": 0.19831357939942088, "learning_rate": 9.315068493150685e-06, "loss": 0.105, "num_tokens": 5261891.0, "step": 69 }, { "epoch": 0.08733624454148471, "grad_norm": 0.22099512786602915, "learning_rate": 9.452054794520548e-06, "loss": 0.1008, "num_tokens": 5337002.0, "step": 70 }, { "epoch": 0.08858390517779165, "grad_norm": 0.18641406929882653, "learning_rate": 9.589041095890411e-06, "loss": 0.1052, "num_tokens": 5413796.0, "step": 71 }, { "epoch": 0.08983156581409857, "grad_norm": 0.19203725614575334, "learning_rate": 9.726027397260275e-06, "loss": 0.1012, "num_tokens": 5489737.0, "step": 72 }, { "epoch": 0.09107922645040549, "grad_norm": 0.20246710152992942, "learning_rate": 9.863013698630138e-06, "loss": 0.1048, "num_tokens": 5564974.0, "step": 73 }, { "epoch": 0.09232688708671241, "grad_norm": 0.1876916336068445, "learning_rate": 1e-05, "loss": 0.1013, "num_tokens": 5641815.0, "step": 74 }, { "epoch": 0.09357454772301933, "grad_norm": 0.21390292979614334, "learning_rate": 9.999995920069922e-06, "loss": 0.1029, "num_tokens": 5718359.0, "step": 75 }, { "epoch": 0.09482220835932627, "grad_norm": 0.2017756259599263, "learning_rate": 9.999983680287084e-06, "loss": 0.1063, "num_tokens": 5794874.0, "step": 76 }, { "epoch": 0.09606986899563319, "grad_norm": 0.18714097159366816, "learning_rate": 9.99996328067368e-06, "loss": 0.0957, "num_tokens": 5870956.0, "step": 77 }, { "epoch": 0.09731752963194011, "grad_norm": 0.18505126145277956, "learning_rate": 9.999934721266702e-06, "loss": 0.0908, "num_tokens": 5946585.0, "step": 78 }, { "epoch": 0.09856519026824703, "grad_norm": 0.19156875617520439, "learning_rate": 9.999898002117937e-06, "loss": 0.095, "num_tokens": 6021741.0, "step": 79 }, { "epoch": 0.09981285090455397, "grad_norm": 0.1929723855918604, "learning_rate": 9.999853123293967e-06, "loss": 0.1011, "num_tokens": 6097082.0, "step": 80 }, { "epoch": 0.10106051154086089, "grad_norm": 0.2001901241869327, "learning_rate": 9.99980008487617e-06, "loss": 0.1006, "num_tokens": 6173503.0, "step": 81 }, { "epoch": 0.10230817217716781, "grad_norm": 0.18306362250643257, "learning_rate": 9.999738886960724e-06, "loss": 0.0983, "num_tokens": 6251913.0, "step": 82 }, { "epoch": 0.10355583281347473, "grad_norm": 0.18878076652513084, "learning_rate": 9.999669529658596e-06, "loss": 0.0972, "num_tokens": 6327736.0, "step": 83 }, { "epoch": 0.10480349344978165, "grad_norm": 0.17758975161938698, "learning_rate": 9.999592013095553e-06, "loss": 0.0919, "num_tokens": 6403683.0, "step": 84 }, { "epoch": 0.10605115408608859, "grad_norm": 0.18599288991127097, "learning_rate": 9.999506337412157e-06, "loss": 0.0952, "num_tokens": 6479398.0, "step": 85 }, { "epoch": 0.10729881472239551, "grad_norm": 0.18920269409828955, "learning_rate": 9.99941250276376e-06, "loss": 0.0939, "num_tokens": 6555703.0, "step": 86 }, { "epoch": 0.10854647535870243, "grad_norm": 0.16667220843732977, "learning_rate": 9.999310509320518e-06, "loss": 0.0845, "num_tokens": 6631800.0, "step": 87 }, { "epoch": 0.10979413599500935, "grad_norm": 0.19074803832408102, "learning_rate": 9.999200357267373e-06, "loss": 0.0968, "num_tokens": 6708923.0, "step": 88 }, { "epoch": 0.11104179663131628, "grad_norm": 0.17189646671230804, "learning_rate": 9.999082046804062e-06, "loss": 0.0908, "num_tokens": 6784265.0, "step": 89 }, { "epoch": 0.11228945726762321, "grad_norm": 0.1887585644537061, "learning_rate": 9.998955578145124e-06, "loss": 0.1033, "num_tokens": 6861631.0, "step": 90 }, { "epoch": 0.11353711790393013, "grad_norm": 0.19093170392495093, "learning_rate": 9.998820951519877e-06, "loss": 0.0957, "num_tokens": 6937782.0, "step": 91 }, { "epoch": 0.11478477854023705, "grad_norm": 0.18068911481247474, "learning_rate": 9.998678167172446e-06, "loss": 0.0916, "num_tokens": 7013064.0, "step": 92 }, { "epoch": 0.11603243917654397, "grad_norm": 0.1807867354947808, "learning_rate": 9.99852722536174e-06, "loss": 0.0912, "num_tokens": 7089477.0, "step": 93 }, { "epoch": 0.11728009981285091, "grad_norm": 0.19214351630835524, "learning_rate": 9.998368126361459e-06, "loss": 0.089, "num_tokens": 7165616.0, "step": 94 }, { "epoch": 0.11852776044915783, "grad_norm": 0.1703054533344699, "learning_rate": 9.998200870460103e-06, "loss": 0.0894, "num_tokens": 7241528.0, "step": 95 }, { "epoch": 0.11977542108546475, "grad_norm": 0.19018854305829216, "learning_rate": 9.998025457960955e-06, "loss": 0.0907, "num_tokens": 7317982.0, "step": 96 }, { "epoch": 0.12102308172177167, "grad_norm": 0.18133063854407905, "learning_rate": 9.997841889182091e-06, "loss": 0.0908, "num_tokens": 7394173.0, "step": 97 }, { "epoch": 0.1222707423580786, "grad_norm": 0.20643507465810595, "learning_rate": 9.997650164456375e-06, "loss": 0.0959, "num_tokens": 7471763.0, "step": 98 }, { "epoch": 0.12351840299438553, "grad_norm": 0.19177798797637297, "learning_rate": 9.997450284131465e-06, "loss": 0.0859, "num_tokens": 7547218.0, "step": 99 }, { "epoch": 0.12476606363069245, "grad_norm": 0.18410460168165857, "learning_rate": 9.997242248569802e-06, "loss": 0.0929, "num_tokens": 7623208.0, "step": 100 }, { "epoch": 0.1260137242669994, "grad_norm": 0.17296669023164052, "learning_rate": 9.997026058148617e-06, "loss": 0.0867, "num_tokens": 7700131.0, "step": 101 }, { "epoch": 0.1272613849033063, "grad_norm": 0.19233260273716293, "learning_rate": 9.996801713259933e-06, "loss": 0.0911, "num_tokens": 7775303.0, "step": 102 }, { "epoch": 0.12850904553961323, "grad_norm": 0.1796254555341303, "learning_rate": 9.996569214310549e-06, "loss": 0.0842, "num_tokens": 7850968.0, "step": 103 }, { "epoch": 0.12975670617592014, "grad_norm": 0.17463731855234868, "learning_rate": 9.99632856172206e-06, "loss": 0.09, "num_tokens": 7928653.0, "step": 104 }, { "epoch": 0.13100436681222707, "grad_norm": 0.18296072966873897, "learning_rate": 9.99607975593084e-06, "loss": 0.093, "num_tokens": 8004382.0, "step": 105 }, { "epoch": 0.132252027448534, "grad_norm": 0.17517424828901534, "learning_rate": 9.995822797388052e-06, "loss": 0.0884, "num_tokens": 8080346.0, "step": 106 }, { "epoch": 0.13349968808484092, "grad_norm": 0.18374538686707345, "learning_rate": 9.995557686559635e-06, "loss": 0.0902, "num_tokens": 8157642.0, "step": 107 }, { "epoch": 0.13474734872114785, "grad_norm": 0.17640658082095845, "learning_rate": 9.995284423926318e-06, "loss": 0.0835, "num_tokens": 8233301.0, "step": 108 }, { "epoch": 0.13599500935745476, "grad_norm": 0.1778279324200043, "learning_rate": 9.995003009983608e-06, "loss": 0.0871, "num_tokens": 8308683.0, "step": 109 }, { "epoch": 0.1372426699937617, "grad_norm": 0.17643769063186895, "learning_rate": 9.994713445241793e-06, "loss": 0.0859, "num_tokens": 8384876.0, "step": 110 }, { "epoch": 0.13849033063006863, "grad_norm": 0.1713769310660126, "learning_rate": 9.994415730225943e-06, "loss": 0.085, "num_tokens": 8461189.0, "step": 111 }, { "epoch": 0.13973799126637554, "grad_norm": 0.16535183080331606, "learning_rate": 9.994109865475903e-06, "loss": 0.087, "num_tokens": 8536969.0, "step": 112 }, { "epoch": 0.14098565190268247, "grad_norm": 0.1713983338918515, "learning_rate": 9.993795851546302e-06, "loss": 0.0893, "num_tokens": 8613651.0, "step": 113 }, { "epoch": 0.14223331253898938, "grad_norm": 0.1797062763194263, "learning_rate": 9.993473689006538e-06, "loss": 0.0877, "num_tokens": 8689954.0, "step": 114 }, { "epoch": 0.14348097317529632, "grad_norm": 0.18257913976658624, "learning_rate": 9.99314337844079e-06, "loss": 0.0918, "num_tokens": 8765928.0, "step": 115 }, { "epoch": 0.14472863381160325, "grad_norm": 0.17867571616347352, "learning_rate": 9.992804920448013e-06, "loss": 0.0826, "num_tokens": 8841922.0, "step": 116 }, { "epoch": 0.14597629444791016, "grad_norm": 0.16778802320918987, "learning_rate": 9.992458315641932e-06, "loss": 0.0848, "num_tokens": 8917607.0, "step": 117 }, { "epoch": 0.1472239550842171, "grad_norm": 0.16474267413213903, "learning_rate": 9.992103564651048e-06, "loss": 0.0813, "num_tokens": 8993184.0, "step": 118 }, { "epoch": 0.14847161572052403, "grad_norm": 0.1841405830448607, "learning_rate": 9.991740668118629e-06, "loss": 0.0837, "num_tokens": 9069615.0, "step": 119 }, { "epoch": 0.14971927635683094, "grad_norm": 0.16992784576092151, "learning_rate": 9.991369626702717e-06, "loss": 0.0844, "num_tokens": 9146612.0, "step": 120 }, { "epoch": 0.15096693699313787, "grad_norm": 0.17131842491068652, "learning_rate": 9.990990441076125e-06, "loss": 0.0818, "num_tokens": 9222704.0, "step": 121 }, { "epoch": 0.15221459762944478, "grad_norm": 0.17804714059607288, "learning_rate": 9.990603111926424e-06, "loss": 0.0858, "num_tokens": 9298555.0, "step": 122 }, { "epoch": 0.15346225826575172, "grad_norm": 0.1729551798123775, "learning_rate": 9.990207639955969e-06, "loss": 0.0873, "num_tokens": 9374922.0, "step": 123 }, { "epoch": 0.15470991890205865, "grad_norm": 0.18111977631889967, "learning_rate": 9.989804025881862e-06, "loss": 0.0839, "num_tokens": 9450879.0, "step": 124 }, { "epoch": 0.15595757953836556, "grad_norm": 0.17388225325106033, "learning_rate": 9.98939227043598e-06, "loss": 0.088, "num_tokens": 9527736.0, "step": 125 }, { "epoch": 0.1572052401746725, "grad_norm": 0.17758998450950486, "learning_rate": 9.988972374364961e-06, "loss": 0.086, "num_tokens": 9603872.0, "step": 126 }, { "epoch": 0.1584529008109794, "grad_norm": 0.16662865144228015, "learning_rate": 9.988544338430203e-06, "loss": 0.0772, "num_tokens": 9679807.0, "step": 127 }, { "epoch": 0.15970056144728634, "grad_norm": 0.1734848561541322, "learning_rate": 9.988108163407865e-06, "loss": 0.0857, "num_tokens": 9755823.0, "step": 128 }, { "epoch": 0.16094822208359327, "grad_norm": 0.1711057297356826, "learning_rate": 9.987663850088862e-06, "loss": 0.0815, "num_tokens": 9832201.0, "step": 129 }, { "epoch": 0.16219588271990018, "grad_norm": 0.18609650663984184, "learning_rate": 9.987211399278871e-06, "loss": 0.0884, "num_tokens": 9908854.0, "step": 130 }, { "epoch": 0.16344354335620712, "grad_norm": 0.16781225049301668, "learning_rate": 9.98675081179832e-06, "loss": 0.0847, "num_tokens": 9985482.0, "step": 131 }, { "epoch": 0.16469120399251402, "grad_norm": 0.1666096665089819, "learning_rate": 9.986282088482397e-06, "loss": 0.082, "num_tokens": 10061705.0, "step": 132 }, { "epoch": 0.16593886462882096, "grad_norm": 0.16861260460290664, "learning_rate": 9.985805230181031e-06, "loss": 0.0775, "num_tokens": 10137791.0, "step": 133 }, { "epoch": 0.1671865252651279, "grad_norm": 0.16847932298131205, "learning_rate": 9.985320237758918e-06, "loss": 0.0788, "num_tokens": 10213758.0, "step": 134 }, { "epoch": 0.1684341859014348, "grad_norm": 0.16847673392353724, "learning_rate": 9.984827112095495e-06, "loss": 0.0789, "num_tokens": 10289192.0, "step": 135 }, { "epoch": 0.16968184653774174, "grad_norm": 0.17266317511461257, "learning_rate": 9.984325854084946e-06, "loss": 0.0792, "num_tokens": 10366976.0, "step": 136 }, { "epoch": 0.17092950717404864, "grad_norm": 0.17254331344081836, "learning_rate": 9.983816464636203e-06, "loss": 0.0787, "num_tokens": 10442519.0, "step": 137 }, { "epoch": 0.17217716781035558, "grad_norm": 0.16483644978279038, "learning_rate": 9.983298944672942e-06, "loss": 0.0761, "num_tokens": 10518729.0, "step": 138 }, { "epoch": 0.17342482844666252, "grad_norm": 0.16316748420591484, "learning_rate": 9.982773295133585e-06, "loss": 0.0749, "num_tokens": 10594664.0, "step": 139 }, { "epoch": 0.17467248908296942, "grad_norm": 0.17456574782390052, "learning_rate": 9.982239516971295e-06, "loss": 0.0826, "num_tokens": 10669996.0, "step": 140 }, { "epoch": 0.17592014971927636, "grad_norm": 0.17564099775879038, "learning_rate": 9.98169761115397e-06, "loss": 0.1012, "num_tokens": 10746016.0, "step": 141 }, { "epoch": 0.1771678103555833, "grad_norm": 0.16763817496083627, "learning_rate": 9.98114757866425e-06, "loss": 0.0732, "num_tokens": 10821806.0, "step": 142 }, { "epoch": 0.1784154709918902, "grad_norm": 0.179538549410821, "learning_rate": 9.980589420499512e-06, "loss": 0.0802, "num_tokens": 10898660.0, "step": 143 }, { "epoch": 0.17966313162819714, "grad_norm": 0.16674096787913362, "learning_rate": 9.980023137671862e-06, "loss": 0.0715, "num_tokens": 10974243.0, "step": 144 }, { "epoch": 0.18091079226450404, "grad_norm": 0.1822510753659331, "learning_rate": 9.979448731208145e-06, "loss": 0.078, "num_tokens": 11050629.0, "step": 145 }, { "epoch": 0.18215845290081098, "grad_norm": 0.16924546334327153, "learning_rate": 9.978866202149931e-06, "loss": 0.0781, "num_tokens": 11126941.0, "step": 146 }, { "epoch": 0.18340611353711792, "grad_norm": 0.1677901047109414, "learning_rate": 9.978275551553526e-06, "loss": 0.0799, "num_tokens": 11203771.0, "step": 147 }, { "epoch": 0.18465377417342482, "grad_norm": 0.18308955197895166, "learning_rate": 9.977676780489953e-06, "loss": 0.08, "num_tokens": 11279637.0, "step": 148 }, { "epoch": 0.18590143480973176, "grad_norm": 0.17364312210957755, "learning_rate": 9.977069890044965e-06, "loss": 0.0768, "num_tokens": 11354687.0, "step": 149 }, { "epoch": 0.18714909544603867, "grad_norm": 0.1574248157414426, "learning_rate": 9.976454881319041e-06, "loss": 0.0759, "num_tokens": 11430507.0, "step": 150 }, { "epoch": 0.1883967560823456, "grad_norm": 0.16705589510506227, "learning_rate": 9.975831755427376e-06, "loss": 0.0804, "num_tokens": 11509344.0, "step": 151 }, { "epoch": 0.18964441671865254, "grad_norm": 0.18306193714265942, "learning_rate": 9.975200513499886e-06, "loss": 0.08, "num_tokens": 11585593.0, "step": 152 }, { "epoch": 0.19089207735495944, "grad_norm": 0.1706796151807509, "learning_rate": 9.974561156681203e-06, "loss": 0.0805, "num_tokens": 11661285.0, "step": 153 }, { "epoch": 0.19213973799126638, "grad_norm": 0.1760002317848842, "learning_rate": 9.973913686130674e-06, "loss": 0.0785, "num_tokens": 11737965.0, "step": 154 }, { "epoch": 0.1933873986275733, "grad_norm": 0.17124129463592325, "learning_rate": 9.973258103022361e-06, "loss": 0.0769, "num_tokens": 11813675.0, "step": 155 }, { "epoch": 0.19463505926388022, "grad_norm": 0.17169440588195126, "learning_rate": 9.97259440854503e-06, "loss": 0.0809, "num_tokens": 11890399.0, "step": 156 }, { "epoch": 0.19588271990018716, "grad_norm": 0.16409420786702833, "learning_rate": 9.971922603902164e-06, "loss": 0.0777, "num_tokens": 11967839.0, "step": 157 }, { "epoch": 0.19713038053649407, "grad_norm": 0.16387759306170735, "learning_rate": 9.971242690311944e-06, "loss": 0.07, "num_tokens": 12042776.0, "step": 158 }, { "epoch": 0.198378041172801, "grad_norm": 0.1718471715592683, "learning_rate": 9.970554669007264e-06, "loss": 0.0765, "num_tokens": 12118641.0, "step": 159 }, { "epoch": 0.19962570180910794, "grad_norm": 0.17138705121466746, "learning_rate": 9.969858541235708e-06, "loss": 0.087, "num_tokens": 12196351.0, "step": 160 }, { "epoch": 0.20087336244541484, "grad_norm": 0.1581669843206168, "learning_rate": 9.969154308259572e-06, "loss": 0.0758, "num_tokens": 12272544.0, "step": 161 }, { "epoch": 0.20212102308172178, "grad_norm": 0.17516159950920784, "learning_rate": 9.968441971355839e-06, "loss": 0.0761, "num_tokens": 12348033.0, "step": 162 }, { "epoch": 0.2033686837180287, "grad_norm": 0.16893850093337096, "learning_rate": 9.967721531816194e-06, "loss": 0.0769, "num_tokens": 12424598.0, "step": 163 }, { "epoch": 0.20461634435433562, "grad_norm": 0.18632279546740235, "learning_rate": 9.96699299094701e-06, "loss": 0.0807, "num_tokens": 12500157.0, "step": 164 }, { "epoch": 0.20586400499064256, "grad_norm": 0.1830366676336023, "learning_rate": 9.966256350069355e-06, "loss": 0.081, "num_tokens": 12577080.0, "step": 165 }, { "epoch": 0.20711166562694946, "grad_norm": 0.16985754100471287, "learning_rate": 9.965511610518975e-06, "loss": 0.085, "num_tokens": 12654089.0, "step": 166 }, { "epoch": 0.2083593262632564, "grad_norm": 0.18253459217534665, "learning_rate": 9.964758773646314e-06, "loss": 0.072, "num_tokens": 12729570.0, "step": 167 }, { "epoch": 0.2096069868995633, "grad_norm": 0.1562839597111707, "learning_rate": 9.963997840816491e-06, "loss": 0.0742, "num_tokens": 12805646.0, "step": 168 }, { "epoch": 0.21085464753587024, "grad_norm": 0.1636269780844281, "learning_rate": 9.963228813409307e-06, "loss": 0.0705, "num_tokens": 12880630.0, "step": 169 }, { "epoch": 0.21210230817217718, "grad_norm": 0.1693391795899073, "learning_rate": 9.962451692819238e-06, "loss": 0.0805, "num_tokens": 12958382.0, "step": 170 }, { "epoch": 0.2133499688084841, "grad_norm": 0.15840163722279885, "learning_rate": 9.961666480455445e-06, "loss": 0.0714, "num_tokens": 13032898.0, "step": 171 }, { "epoch": 0.21459762944479102, "grad_norm": 0.18996842682963302, "learning_rate": 9.96087317774175e-06, "loss": 0.0781, "num_tokens": 13111574.0, "step": 172 }, { "epoch": 0.21584529008109793, "grad_norm": 0.16107199270740594, "learning_rate": 9.960071786116652e-06, "loss": 0.0751, "num_tokens": 13187805.0, "step": 173 }, { "epoch": 0.21709295071740486, "grad_norm": 0.1646727452686491, "learning_rate": 9.959262307033318e-06, "loss": 0.0783, "num_tokens": 13266711.0, "step": 174 }, { "epoch": 0.2183406113537118, "grad_norm": 0.1684533239322113, "learning_rate": 9.958444741959577e-06, "loss": 0.0733, "num_tokens": 13342080.0, "step": 175 }, { "epoch": 0.2195882719900187, "grad_norm": 0.16224608231089951, "learning_rate": 9.957619092377921e-06, "loss": 0.0723, "num_tokens": 13417353.0, "step": 176 }, { "epoch": 0.22083593262632564, "grad_norm": 0.16923571024991296, "learning_rate": 9.956785359785501e-06, "loss": 0.0714, "num_tokens": 13493523.0, "step": 177 }, { "epoch": 0.22208359326263255, "grad_norm": 0.1726436941795396, "learning_rate": 9.95594354569413e-06, "loss": 0.0733, "num_tokens": 13569377.0, "step": 178 }, { "epoch": 0.22333125389893949, "grad_norm": 0.1716907397301449, "learning_rate": 9.955093651630271e-06, "loss": 0.0745, "num_tokens": 13646284.0, "step": 179 }, { "epoch": 0.22457891453524642, "grad_norm": 0.17530570629038625, "learning_rate": 9.954235679135035e-06, "loss": 0.0749, "num_tokens": 13723722.0, "step": 180 }, { "epoch": 0.22582657517155333, "grad_norm": 0.1697074424612026, "learning_rate": 9.953369629764187e-06, "loss": 0.0736, "num_tokens": 13799090.0, "step": 181 }, { "epoch": 0.22707423580786026, "grad_norm": 0.16447687811649017, "learning_rate": 9.952495505088138e-06, "loss": 0.0744, "num_tokens": 13878148.0, "step": 182 }, { "epoch": 0.2283218964441672, "grad_norm": 0.15744871056244594, "learning_rate": 9.95161330669194e-06, "loss": 0.0716, "num_tokens": 13954623.0, "step": 183 }, { "epoch": 0.2295695570804741, "grad_norm": 0.17041782127698285, "learning_rate": 9.950723036175282e-06, "loss": 0.072, "num_tokens": 14030485.0, "step": 184 }, { "epoch": 0.23081721771678104, "grad_norm": 0.16544306500314798, "learning_rate": 9.9498246951525e-06, "loss": 0.0708, "num_tokens": 14106208.0, "step": 185 }, { "epoch": 0.23206487835308795, "grad_norm": 0.16364316998876874, "learning_rate": 9.948918285252551e-06, "loss": 0.0714, "num_tokens": 14182086.0, "step": 186 }, { "epoch": 0.23331253898939489, "grad_norm": 0.17464505187104798, "learning_rate": 9.948003808119034e-06, "loss": 0.0671, "num_tokens": 14257948.0, "step": 187 }, { "epoch": 0.23456019962570182, "grad_norm": 0.16183565234470615, "learning_rate": 9.94708126541017e-06, "loss": 0.0725, "num_tokens": 14334070.0, "step": 188 }, { "epoch": 0.23580786026200873, "grad_norm": 0.1607703903980471, "learning_rate": 9.94615065879881e-06, "loss": 0.0658, "num_tokens": 14408757.0, "step": 189 }, { "epoch": 0.23705552089831566, "grad_norm": 0.16763160398244317, "learning_rate": 9.945211989972425e-06, "loss": 0.0784, "num_tokens": 14485925.0, "step": 190 }, { "epoch": 0.23830318153462257, "grad_norm": 0.1798190055677699, "learning_rate": 9.944265260633105e-06, "loss": 0.0805, "num_tokens": 14561749.0, "step": 191 }, { "epoch": 0.2395508421709295, "grad_norm": 0.16757844224240012, "learning_rate": 9.943310472497556e-06, "loss": 0.0653, "num_tokens": 14636925.0, "step": 192 }, { "epoch": 0.24079850280723644, "grad_norm": 0.17170337264420482, "learning_rate": 9.942347627297095e-06, "loss": 0.0674, "num_tokens": 14712720.0, "step": 193 }, { "epoch": 0.24204616344354335, "grad_norm": 0.184755369945989, "learning_rate": 9.941376726777656e-06, "loss": 0.0771, "num_tokens": 14789345.0, "step": 194 }, { "epoch": 0.24329382407985028, "grad_norm": 0.16669838986318447, "learning_rate": 9.940397772699773e-06, "loss": 0.0739, "num_tokens": 14865183.0, "step": 195 }, { "epoch": 0.2445414847161572, "grad_norm": 0.16033230892360856, "learning_rate": 9.939410766838586e-06, "loss": 0.0689, "num_tokens": 14941021.0, "step": 196 }, { "epoch": 0.24578914535246413, "grad_norm": 0.15982589644947345, "learning_rate": 9.938415710983834e-06, "loss": 0.0696, "num_tokens": 15016682.0, "step": 197 }, { "epoch": 0.24703680598877106, "grad_norm": 0.16352167410709095, "learning_rate": 9.937412606939854e-06, "loss": 0.0744, "num_tokens": 15093755.0, "step": 198 }, { "epoch": 0.24828446662507797, "grad_norm": 0.16007627601527916, "learning_rate": 9.936401456525578e-06, "loss": 0.0693, "num_tokens": 15168241.0, "step": 199 }, { "epoch": 0.2495321272613849, "grad_norm": 0.18748749103582008, "learning_rate": 9.935382261574527e-06, "loss": 0.07, "num_tokens": 15243827.0, "step": 200 }, { "epoch": 0.25077978789769184, "grad_norm": 0.16246756040982152, "learning_rate": 9.934355023934808e-06, "loss": 0.0677, "num_tokens": 15319199.0, "step": 201 }, { "epoch": 0.2520274485339988, "grad_norm": 0.15613654661468085, "learning_rate": 9.933319745469117e-06, "loss": 0.0633, "num_tokens": 15394502.0, "step": 202 }, { "epoch": 0.25327510917030566, "grad_norm": 0.16230294430226105, "learning_rate": 9.932276428054723e-06, "loss": 0.0666, "num_tokens": 15469765.0, "step": 203 }, { "epoch": 0.2545227698066126, "grad_norm": 0.17049974841139676, "learning_rate": 9.931225073583476e-06, "loss": 0.0692, "num_tokens": 15546008.0, "step": 204 }, { "epoch": 0.2557704304429195, "grad_norm": 0.1791622410807636, "learning_rate": 9.930165683961803e-06, "loss": 0.0667, "num_tokens": 15622373.0, "step": 205 }, { "epoch": 0.25701809107922646, "grad_norm": 0.15668763943680733, "learning_rate": 9.929098261110694e-06, "loss": 0.065, "num_tokens": 15697707.0, "step": 206 }, { "epoch": 0.2582657517155334, "grad_norm": 0.17590506153752944, "learning_rate": 9.92802280696571e-06, "loss": 0.0691, "num_tokens": 15773700.0, "step": 207 }, { "epoch": 0.2595134123518403, "grad_norm": 0.17180051970570445, "learning_rate": 9.926939323476976e-06, "loss": 0.0721, "num_tokens": 15850214.0, "step": 208 }, { "epoch": 0.2607610729881472, "grad_norm": 0.16004289569776617, "learning_rate": 9.925847812609174e-06, "loss": 0.065, "num_tokens": 15925909.0, "step": 209 }, { "epoch": 0.26200873362445415, "grad_norm": 0.16249065891465686, "learning_rate": 9.924748276341541e-06, "loss": 0.0707, "num_tokens": 16002947.0, "step": 210 }, { "epoch": 0.2632563942607611, "grad_norm": 0.16482669351968468, "learning_rate": 9.923640716667872e-06, "loss": 0.0707, "num_tokens": 16079504.0, "step": 211 }, { "epoch": 0.264504054897068, "grad_norm": 0.1580233903929819, "learning_rate": 9.922525135596507e-06, "loss": 0.067, "num_tokens": 16154237.0, "step": 212 }, { "epoch": 0.2657517155333749, "grad_norm": 0.1721102538638486, "learning_rate": 9.92140153515033e-06, "loss": 0.0731, "num_tokens": 16231125.0, "step": 213 }, { "epoch": 0.26699937616968183, "grad_norm": 0.16202985302325928, "learning_rate": 9.92026991736677e-06, "loss": 0.0724, "num_tokens": 16307288.0, "step": 214 }, { "epoch": 0.26824703680598877, "grad_norm": 0.16552773840332874, "learning_rate": 9.919130284297791e-06, "loss": 0.0741, "num_tokens": 16383863.0, "step": 215 }, { "epoch": 0.2694946974422957, "grad_norm": 0.16797535769586705, "learning_rate": 9.917982638009891e-06, "loss": 0.0722, "num_tokens": 16459819.0, "step": 216 }, { "epoch": 0.27074235807860264, "grad_norm": 0.15652756641636162, "learning_rate": 9.916826980584103e-06, "loss": 0.0706, "num_tokens": 16536109.0, "step": 217 }, { "epoch": 0.2719900187149095, "grad_norm": 0.1579731989627712, "learning_rate": 9.91566331411598e-06, "loss": 0.0732, "num_tokens": 16612285.0, "step": 218 }, { "epoch": 0.27323767935121646, "grad_norm": 0.16458952129615206, "learning_rate": 9.914491640715603e-06, "loss": 0.069, "num_tokens": 16688566.0, "step": 219 }, { "epoch": 0.2744853399875234, "grad_norm": 0.17577595597550605, "learning_rate": 9.913311962507569e-06, "loss": 0.0736, "num_tokens": 16765400.0, "step": 220 }, { "epoch": 0.2757330006238303, "grad_norm": 0.1555054279626727, "learning_rate": 9.912124281630991e-06, "loss": 0.0663, "num_tokens": 16841350.0, "step": 221 }, { "epoch": 0.27698066126013726, "grad_norm": 0.16334335494111618, "learning_rate": 9.910928600239493e-06, "loss": 0.0702, "num_tokens": 16917039.0, "step": 222 }, { "epoch": 0.27822832189644414, "grad_norm": 0.168630513058605, "learning_rate": 9.909724920501207e-06, "loss": 0.072, "num_tokens": 16993479.0, "step": 223 }, { "epoch": 0.2794759825327511, "grad_norm": 0.1601670962324957, "learning_rate": 9.90851324459877e-06, "loss": 0.0712, "num_tokens": 17070088.0, "step": 224 }, { "epoch": 0.280723643169058, "grad_norm": 0.15618610848645934, "learning_rate": 9.907293574729317e-06, "loss": 0.0665, "num_tokens": 17146961.0, "step": 225 }, { "epoch": 0.28197130380536495, "grad_norm": 0.16271573902028957, "learning_rate": 9.906065913104474e-06, "loss": 0.0676, "num_tokens": 17222177.0, "step": 226 }, { "epoch": 0.2832189644416719, "grad_norm": 0.16139454951559776, "learning_rate": 9.904830261950366e-06, "loss": 0.0609, "num_tokens": 17297238.0, "step": 227 }, { "epoch": 0.28446662507797876, "grad_norm": 0.15660531460344787, "learning_rate": 9.903586623507603e-06, "loss": 0.0652, "num_tokens": 17372069.0, "step": 228 }, { "epoch": 0.2857142857142857, "grad_norm": 0.14936446527678607, "learning_rate": 9.902335000031273e-06, "loss": 0.0661, "num_tokens": 17447408.0, "step": 229 }, { "epoch": 0.28696194635059263, "grad_norm": 0.1718394050631786, "learning_rate": 9.901075393790953e-06, "loss": 0.0691, "num_tokens": 17524605.0, "step": 230 }, { "epoch": 0.28820960698689957, "grad_norm": 0.1614819161412718, "learning_rate": 9.899807807070684e-06, "loss": 0.0637, "num_tokens": 17600453.0, "step": 231 }, { "epoch": 0.2894572676232065, "grad_norm": 0.16194959667978912, "learning_rate": 9.898532242168987e-06, "loss": 0.0663, "num_tokens": 17675636.0, "step": 232 }, { "epoch": 0.29070492825951344, "grad_norm": 0.2044575062693707, "learning_rate": 9.897248701398848e-06, "loss": 0.0717, "num_tokens": 17751533.0, "step": 233 }, { "epoch": 0.2919525888958203, "grad_norm": 0.17517502746183747, "learning_rate": 9.895957187087713e-06, "loss": 0.0675, "num_tokens": 17827827.0, "step": 234 }, { "epoch": 0.29320024953212726, "grad_norm": 0.1615899115066392, "learning_rate": 9.894657701577488e-06, "loss": 0.0706, "num_tokens": 17903877.0, "step": 235 }, { "epoch": 0.2944479101684342, "grad_norm": 0.1574811806997747, "learning_rate": 9.893350247224532e-06, "loss": 0.0652, "num_tokens": 17979658.0, "step": 236 }, { "epoch": 0.2956955708047411, "grad_norm": 0.16378983393360064, "learning_rate": 9.892034826399657e-06, "loss": 0.0676, "num_tokens": 18055261.0, "step": 237 }, { "epoch": 0.29694323144104806, "grad_norm": 0.15078063149117402, "learning_rate": 9.890711441488117e-06, "loss": 0.0653, "num_tokens": 18131915.0, "step": 238 }, { "epoch": 0.29819089207735494, "grad_norm": 0.171614702706413, "learning_rate": 9.889380094889609e-06, "loss": 0.0687, "num_tokens": 18208182.0, "step": 239 }, { "epoch": 0.2994385527136619, "grad_norm": 0.1550942059023908, "learning_rate": 9.888040789018267e-06, "loss": 0.0651, "num_tokens": 18283562.0, "step": 240 }, { "epoch": 0.3006862133499688, "grad_norm": 0.1564914167240467, "learning_rate": 9.886693526302657e-06, "loss": 0.0676, "num_tokens": 18360629.0, "step": 241 }, { "epoch": 0.30193387398627575, "grad_norm": 0.15787622574156354, "learning_rate": 9.885338309185775e-06, "loss": 0.0662, "num_tokens": 18437479.0, "step": 242 }, { "epoch": 0.3031815346225827, "grad_norm": 0.1630718774763793, "learning_rate": 9.883975140125035e-06, "loss": 0.0644, "num_tokens": 18514069.0, "step": 243 }, { "epoch": 0.30442919525888956, "grad_norm": 0.16940710112136453, "learning_rate": 9.88260402159228e-06, "loss": 0.064, "num_tokens": 18590589.0, "step": 244 }, { "epoch": 0.3056768558951965, "grad_norm": 0.15453236589867414, "learning_rate": 9.88122495607376e-06, "loss": 0.0603, "num_tokens": 18666004.0, "step": 245 }, { "epoch": 0.30692451653150343, "grad_norm": 0.15529247967534632, "learning_rate": 9.879837946070138e-06, "loss": 0.0644, "num_tokens": 18743017.0, "step": 246 }, { "epoch": 0.30817217716781037, "grad_norm": 0.1696878843200201, "learning_rate": 9.878442994096481e-06, "loss": 0.0691, "num_tokens": 18820248.0, "step": 247 }, { "epoch": 0.3094198378041173, "grad_norm": 0.1607452517295558, "learning_rate": 9.87704010268226e-06, "loss": 0.0614, "num_tokens": 18895528.0, "step": 248 }, { "epoch": 0.3106674984404242, "grad_norm": 0.15937732385503722, "learning_rate": 9.87562927437134e-06, "loss": 0.0652, "num_tokens": 18971471.0, "step": 249 }, { "epoch": 0.3119151590767311, "grad_norm": 0.15886046706962062, "learning_rate": 9.87421051172198e-06, "loss": 0.0667, "num_tokens": 19048325.0, "step": 250 }, { "epoch": 0.31316281971303805, "grad_norm": 0.14717021560879917, "learning_rate": 9.872783817306827e-06, "loss": 0.0625, "num_tokens": 19124561.0, "step": 251 }, { "epoch": 0.314410480349345, "grad_norm": 0.17052344641660577, "learning_rate": 9.871349193712905e-06, "loss": 0.0726, "num_tokens": 19201931.0, "step": 252 }, { "epoch": 0.3156581409856519, "grad_norm": 0.15865459451726951, "learning_rate": 9.869906643541625e-06, "loss": 0.0676, "num_tokens": 19278610.0, "step": 253 }, { "epoch": 0.3169058016219588, "grad_norm": 0.15293715680282027, "learning_rate": 9.868456169408763e-06, "loss": 0.0625, "num_tokens": 19354612.0, "step": 254 }, { "epoch": 0.31815346225826574, "grad_norm": 0.1550509113871186, "learning_rate": 9.866997773944469e-06, "loss": 0.0614, "num_tokens": 19430341.0, "step": 255 }, { "epoch": 0.3194011228945727, "grad_norm": 0.16470912732464232, "learning_rate": 9.865531459793254e-06, "loss": 0.0681, "num_tokens": 19510053.0, "step": 256 }, { "epoch": 0.3206487835308796, "grad_norm": 0.1725567554413264, "learning_rate": 9.864057229613988e-06, "loss": 0.0646, "num_tokens": 19587458.0, "step": 257 }, { "epoch": 0.32189644416718655, "grad_norm": 0.1649145739650761, "learning_rate": 9.862575086079897e-06, "loss": 0.0594, "num_tokens": 19663013.0, "step": 258 }, { "epoch": 0.3231441048034934, "grad_norm": 0.15548449511521806, "learning_rate": 9.861085031878556e-06, "loss": 0.0678, "num_tokens": 19738422.0, "step": 259 }, { "epoch": 0.32439176543980036, "grad_norm": 0.18027689043783654, "learning_rate": 9.859587069711883e-06, "loss": 0.0709, "num_tokens": 19815943.0, "step": 260 }, { "epoch": 0.3256394260761073, "grad_norm": 0.17995034139406604, "learning_rate": 9.858081202296133e-06, "loss": 0.0649, "num_tokens": 19891473.0, "step": 261 }, { "epoch": 0.32688708671241423, "grad_norm": 0.16879224411874624, "learning_rate": 9.856567432361903e-06, "loss": 0.0602, "num_tokens": 19966715.0, "step": 262 }, { "epoch": 0.32813474734872117, "grad_norm": 0.1616050002907175, "learning_rate": 9.855045762654115e-06, "loss": 0.0658, "num_tokens": 20043692.0, "step": 263 }, { "epoch": 0.32938240798502805, "grad_norm": 0.15471903513941546, "learning_rate": 9.853516195932014e-06, "loss": 0.0648, "num_tokens": 20120146.0, "step": 264 }, { "epoch": 0.330630068621335, "grad_norm": 0.16554661598465897, "learning_rate": 9.851978734969168e-06, "loss": 0.0674, "num_tokens": 20197768.0, "step": 265 }, { "epoch": 0.3318777292576419, "grad_norm": 0.16044112115366635, "learning_rate": 9.850433382553457e-06, "loss": 0.0644, "num_tokens": 20274631.0, "step": 266 }, { "epoch": 0.33312538989394885, "grad_norm": 0.15623389055861656, "learning_rate": 9.848880141487076e-06, "loss": 0.0586, "num_tokens": 20349379.0, "step": 267 }, { "epoch": 0.3343730505302558, "grad_norm": 0.16320331916068953, "learning_rate": 9.847319014586517e-06, "loss": 0.068, "num_tokens": 20425517.0, "step": 268 }, { "epoch": 0.33562071116656267, "grad_norm": 0.17410549807002865, "learning_rate": 9.845750004682576e-06, "loss": 0.0704, "num_tokens": 20502237.0, "step": 269 }, { "epoch": 0.3368683718028696, "grad_norm": 0.16317259612786264, "learning_rate": 9.844173114620342e-06, "loss": 0.0621, "num_tokens": 20577801.0, "step": 270 }, { "epoch": 0.33811603243917654, "grad_norm": 0.18438429356646444, "learning_rate": 9.842588347259192e-06, "loss": 0.0702, "num_tokens": 20656419.0, "step": 271 }, { "epoch": 0.3393636930754835, "grad_norm": 0.16727261944128552, "learning_rate": 9.84099570547279e-06, "loss": 0.0628, "num_tokens": 20731879.0, "step": 272 }, { "epoch": 0.3406113537117904, "grad_norm": 0.1568241324741727, "learning_rate": 9.839395192149077e-06, "loss": 0.0622, "num_tokens": 20808391.0, "step": 273 }, { "epoch": 0.3418590143480973, "grad_norm": 0.16750714409000023, "learning_rate": 9.837786810190268e-06, "loss": 0.0742, "num_tokens": 20886624.0, "step": 274 }, { "epoch": 0.3431066749844042, "grad_norm": 0.1624035418734388, "learning_rate": 9.836170562512844e-06, "loss": 0.0597, "num_tokens": 20961132.0, "step": 275 }, { "epoch": 0.34435433562071116, "grad_norm": 0.1612541838233848, "learning_rate": 9.83454645204755e-06, "loss": 0.0602, "num_tokens": 21037133.0, "step": 276 }, { "epoch": 0.3456019962570181, "grad_norm": 0.14663015839501767, "learning_rate": 9.832914481739391e-06, "loss": 0.0594, "num_tokens": 21113052.0, "step": 277 }, { "epoch": 0.34684965689332503, "grad_norm": 0.15397436093532624, "learning_rate": 9.831274654547623e-06, "loss": 0.0631, "num_tokens": 21189025.0, "step": 278 }, { "epoch": 0.34809731752963197, "grad_norm": 0.16081389996426831, "learning_rate": 9.829626973445745e-06, "loss": 0.0612, "num_tokens": 21264149.0, "step": 279 }, { "epoch": 0.34934497816593885, "grad_norm": 0.16965553989510368, "learning_rate": 9.827971441421504e-06, "loss": 0.0651, "num_tokens": 21341066.0, "step": 280 }, { "epoch": 0.3505926388022458, "grad_norm": 0.1565695656314987, "learning_rate": 9.826308061476878e-06, "loss": 0.0658, "num_tokens": 21417561.0, "step": 281 }, { "epoch": 0.3518402994385527, "grad_norm": 0.16773412549123703, "learning_rate": 9.824636836628078e-06, "loss": 0.0598, "num_tokens": 21492874.0, "step": 282 }, { "epoch": 0.35308796007485965, "grad_norm": 0.14945204955447638, "learning_rate": 9.822957769905544e-06, "loss": 0.0598, "num_tokens": 21568772.0, "step": 283 }, { "epoch": 0.3543356207111666, "grad_norm": 0.15223520296820806, "learning_rate": 9.821270864353924e-06, "loss": 0.064, "num_tokens": 21645126.0, "step": 284 }, { "epoch": 0.35558328134747347, "grad_norm": 0.15032291291240013, "learning_rate": 9.819576123032092e-06, "loss": 0.0623, "num_tokens": 21720674.0, "step": 285 }, { "epoch": 0.3568309419837804, "grad_norm": 0.15628452605672147, "learning_rate": 9.817873549013127e-06, "loss": 0.0618, "num_tokens": 21797336.0, "step": 286 }, { "epoch": 0.35807860262008734, "grad_norm": 0.16221897267977384, "learning_rate": 9.816163145384308e-06, "loss": 0.0694, "num_tokens": 21875109.0, "step": 287 }, { "epoch": 0.3593262632563943, "grad_norm": 0.16324022520446585, "learning_rate": 9.814444915247115e-06, "loss": 0.0591, "num_tokens": 21951197.0, "step": 288 }, { "epoch": 0.3605739238927012, "grad_norm": 0.16795042723920559, "learning_rate": 9.81271886171722e-06, "loss": 0.0648, "num_tokens": 22028072.0, "step": 289 }, { "epoch": 0.3618215845290081, "grad_norm": 0.16591459092130173, "learning_rate": 9.810984987924477e-06, "loss": 0.0645, "num_tokens": 22104892.0, "step": 290 }, { "epoch": 0.363069245165315, "grad_norm": 0.17711965952141914, "learning_rate": 9.809243297012923e-06, "loss": 0.0646, "num_tokens": 22181330.0, "step": 291 }, { "epoch": 0.36431690580162196, "grad_norm": 0.16517263676833355, "learning_rate": 9.807493792140774e-06, "loss": 0.0592, "num_tokens": 22256931.0, "step": 292 }, { "epoch": 0.3655645664379289, "grad_norm": 0.16613266514267386, "learning_rate": 9.805736476480407e-06, "loss": 0.083, "num_tokens": 22333756.0, "step": 293 }, { "epoch": 0.36681222707423583, "grad_norm": 0.17374361101676075, "learning_rate": 9.803971353218367e-06, "loss": 0.065, "num_tokens": 22409725.0, "step": 294 }, { "epoch": 0.3680598877105427, "grad_norm": 0.16484309467609493, "learning_rate": 9.802198425555358e-06, "loss": 0.0644, "num_tokens": 22485868.0, "step": 295 }, { "epoch": 0.36930754834684965, "grad_norm": 0.15491884230683195, "learning_rate": 9.800417696706234e-06, "loss": 0.0631, "num_tokens": 22562139.0, "step": 296 }, { "epoch": 0.3705552089831566, "grad_norm": 0.16556597167488193, "learning_rate": 9.798629169899992e-06, "loss": 0.0625, "num_tokens": 22638781.0, "step": 297 }, { "epoch": 0.3718028696194635, "grad_norm": 0.15406768450216785, "learning_rate": 9.796832848379775e-06, "loss": 0.065, "num_tokens": 22715605.0, "step": 298 }, { "epoch": 0.37305053025577045, "grad_norm": 0.15504153849950547, "learning_rate": 9.795028735402853e-06, "loss": 0.062, "num_tokens": 22791830.0, "step": 299 }, { "epoch": 0.37429819089207733, "grad_norm": 0.15412320004002014, "learning_rate": 9.79321683424063e-06, "loss": 0.0615, "num_tokens": 22867531.0, "step": 300 }, { "epoch": 0.37554585152838427, "grad_norm": 0.15756901782309857, "learning_rate": 9.791397148178632e-06, "loss": 0.0553, "num_tokens": 22943631.0, "step": 301 }, { "epoch": 0.3767935121646912, "grad_norm": 0.1531898630456781, "learning_rate": 9.789569680516497e-06, "loss": 0.0637, "num_tokens": 23019403.0, "step": 302 }, { "epoch": 0.37804117280099814, "grad_norm": 0.1515173653286233, "learning_rate": 9.78773443456798e-06, "loss": 0.0622, "num_tokens": 23095187.0, "step": 303 }, { "epoch": 0.3792888334373051, "grad_norm": 0.15625736804819645, "learning_rate": 9.785891413660931e-06, "loss": 0.0589, "num_tokens": 23170661.0, "step": 304 }, { "epoch": 0.38053649407361195, "grad_norm": 0.15883217445897935, "learning_rate": 9.784040621137308e-06, "loss": 0.0599, "num_tokens": 23247363.0, "step": 305 }, { "epoch": 0.3817841547099189, "grad_norm": 0.14990678103187605, "learning_rate": 9.78218206035316e-06, "loss": 0.0565, "num_tokens": 23322963.0, "step": 306 }, { "epoch": 0.3830318153462258, "grad_norm": 0.15000063559454646, "learning_rate": 9.780315734678612e-06, "loss": 0.0555, "num_tokens": 23398550.0, "step": 307 }, { "epoch": 0.38427947598253276, "grad_norm": 0.15831644899679806, "learning_rate": 9.778441647497882e-06, "loss": 0.0618, "num_tokens": 23474279.0, "step": 308 }, { "epoch": 0.3855271366188397, "grad_norm": 0.15900356422471373, "learning_rate": 9.776559802209255e-06, "loss": 0.0598, "num_tokens": 23551400.0, "step": 309 }, { "epoch": 0.3867747972551466, "grad_norm": 0.16055646774167992, "learning_rate": 9.774670202225084e-06, "loss": 0.0604, "num_tokens": 23627330.0, "step": 310 }, { "epoch": 0.3880224578914535, "grad_norm": 0.1591984868303217, "learning_rate": 9.772772850971788e-06, "loss": 0.0605, "num_tokens": 23704675.0, "step": 311 }, { "epoch": 0.38927011852776044, "grad_norm": 0.1503919917904232, "learning_rate": 9.770867751889837e-06, "loss": 0.0621, "num_tokens": 23781138.0, "step": 312 }, { "epoch": 0.3905177791640674, "grad_norm": 0.15795257667206367, "learning_rate": 9.76895490843375e-06, "loss": 0.0611, "num_tokens": 23856629.0, "step": 313 }, { "epoch": 0.3917654398003743, "grad_norm": 0.15149709424372163, "learning_rate": 9.767034324072091e-06, "loss": 0.0567, "num_tokens": 23932154.0, "step": 314 }, { "epoch": 0.3930131004366812, "grad_norm": 0.14733318425088304, "learning_rate": 9.76510600228746e-06, "loss": 0.0571, "num_tokens": 24007368.0, "step": 315 }, { "epoch": 0.39426076107298813, "grad_norm": 0.15325605310711474, "learning_rate": 9.763169946576488e-06, "loss": 0.0621, "num_tokens": 24083790.0, "step": 316 }, { "epoch": 0.39550842170929507, "grad_norm": 0.16166497874502006, "learning_rate": 9.76122616044983e-06, "loss": 0.061, "num_tokens": 24159562.0, "step": 317 }, { "epoch": 0.396756082345602, "grad_norm": 0.17868899974287344, "learning_rate": 9.759274647432156e-06, "loss": 0.0766, "num_tokens": 24237537.0, "step": 318 }, { "epoch": 0.39800374298190894, "grad_norm": 0.14894383219801544, "learning_rate": 9.75731541106215e-06, "loss": 0.0575, "num_tokens": 24314398.0, "step": 319 }, { "epoch": 0.39925140361821587, "grad_norm": 0.16352349044368675, "learning_rate": 9.755348454892498e-06, "loss": 0.0653, "num_tokens": 24390811.0, "step": 320 }, { "epoch": 0.40049906425452275, "grad_norm": 0.1538556452099385, "learning_rate": 9.753373782489887e-06, "loss": 0.0618, "num_tokens": 24467181.0, "step": 321 }, { "epoch": 0.4017467248908297, "grad_norm": 0.15876023490938346, "learning_rate": 9.751391397434996e-06, "loss": 0.0635, "num_tokens": 24543387.0, "step": 322 }, { "epoch": 0.4029943855271366, "grad_norm": 0.1715301439726509, "learning_rate": 9.74940130332249e-06, "loss": 0.0595, "num_tokens": 24620711.0, "step": 323 }, { "epoch": 0.40424204616344356, "grad_norm": 0.1563923945268615, "learning_rate": 9.747403503761006e-06, "loss": 0.0623, "num_tokens": 24697604.0, "step": 324 }, { "epoch": 0.4054897067997505, "grad_norm": 0.16071610421961047, "learning_rate": 9.74539800237316e-06, "loss": 0.0626, "num_tokens": 24774256.0, "step": 325 }, { "epoch": 0.4067373674360574, "grad_norm": 0.14769896169660982, "learning_rate": 9.743384802795535e-06, "loss": 0.0584, "num_tokens": 24849764.0, "step": 326 }, { "epoch": 0.4079850280723643, "grad_norm": 0.16996418544343941, "learning_rate": 9.741363908678669e-06, "loss": 0.0658, "num_tokens": 24926572.0, "step": 327 }, { "epoch": 0.40923268870867124, "grad_norm": 0.14800046537258346, "learning_rate": 9.739335323687052e-06, "loss": 0.0587, "num_tokens": 25002238.0, "step": 328 }, { "epoch": 0.4104803493449782, "grad_norm": 0.16207435268349787, "learning_rate": 9.737299051499125e-06, "loss": 0.0668, "num_tokens": 25083450.0, "step": 329 }, { "epoch": 0.4117280099812851, "grad_norm": 0.15391056645444173, "learning_rate": 9.735255095807263e-06, "loss": 0.0645, "num_tokens": 25160430.0, "step": 330 }, { "epoch": 0.412975670617592, "grad_norm": 0.14555280500195275, "learning_rate": 9.733203460317777e-06, "loss": 0.0599, "num_tokens": 25236437.0, "step": 331 }, { "epoch": 0.41422333125389893, "grad_norm": 0.15489125686680358, "learning_rate": 9.731144148750898e-06, "loss": 0.0616, "num_tokens": 25312474.0, "step": 332 }, { "epoch": 0.41547099189020587, "grad_norm": 0.15739674799028386, "learning_rate": 9.729077164840784e-06, "loss": 0.0632, "num_tokens": 25387735.0, "step": 333 }, { "epoch": 0.4167186525265128, "grad_norm": 0.16457084948558323, "learning_rate": 9.727002512335502e-06, "loss": 0.0604, "num_tokens": 25465301.0, "step": 334 }, { "epoch": 0.41796631316281974, "grad_norm": 0.15500114590858863, "learning_rate": 9.724920194997022e-06, "loss": 0.0582, "num_tokens": 25541452.0, "step": 335 }, { "epoch": 0.4192139737991266, "grad_norm": 0.1586434395647764, "learning_rate": 9.722830216601217e-06, "loss": 0.0624, "num_tokens": 25618060.0, "step": 336 }, { "epoch": 0.42046163443543355, "grad_norm": 0.1635042500886567, "learning_rate": 9.720732580937848e-06, "loss": 0.064, "num_tokens": 25695664.0, "step": 337 }, { "epoch": 0.4217092950717405, "grad_norm": 0.17358072634477886, "learning_rate": 9.718627291810561e-06, "loss": 0.0666, "num_tokens": 25772767.0, "step": 338 }, { "epoch": 0.4229569557080474, "grad_norm": 0.1497956417489676, "learning_rate": 9.716514353036884e-06, "loss": 0.0582, "num_tokens": 25851734.0, "step": 339 }, { "epoch": 0.42420461634435436, "grad_norm": 0.15852636906702566, "learning_rate": 9.714393768448214e-06, "loss": 0.0602, "num_tokens": 25928318.0, "step": 340 }, { "epoch": 0.42545227698066124, "grad_norm": 0.1561377489499093, "learning_rate": 9.712265541889809e-06, "loss": 0.0539, "num_tokens": 26003907.0, "step": 341 }, { "epoch": 0.4266999376169682, "grad_norm": 0.16140448999169976, "learning_rate": 9.710129677220788e-06, "loss": 0.061, "num_tokens": 26079673.0, "step": 342 }, { "epoch": 0.4279475982532751, "grad_norm": 0.14579047647672982, "learning_rate": 9.707986178314123e-06, "loss": 0.0519, "num_tokens": 26154655.0, "step": 343 }, { "epoch": 0.42919525888958204, "grad_norm": 0.153370210574673, "learning_rate": 9.705835049056621e-06, "loss": 0.0564, "num_tokens": 26230489.0, "step": 344 }, { "epoch": 0.430442919525889, "grad_norm": 0.14963027725660538, "learning_rate": 9.70367629334893e-06, "loss": 0.0556, "num_tokens": 26306723.0, "step": 345 }, { "epoch": 0.43169058016219586, "grad_norm": 0.15481215278421856, "learning_rate": 9.701509915105527e-06, "loss": 0.0582, "num_tokens": 26382427.0, "step": 346 }, { "epoch": 0.4329382407985028, "grad_norm": 0.15019442485278475, "learning_rate": 9.699335918254714e-06, "loss": 0.0586, "num_tokens": 26457992.0, "step": 347 }, { "epoch": 0.43418590143480973, "grad_norm": 0.14883402591779346, "learning_rate": 9.6971543067386e-06, "loss": 0.0536, "num_tokens": 26533426.0, "step": 348 }, { "epoch": 0.43543356207111666, "grad_norm": 0.1493537080558344, "learning_rate": 9.694965084513106e-06, "loss": 0.0541, "num_tokens": 26608589.0, "step": 349 }, { "epoch": 0.4366812227074236, "grad_norm": 0.1548264803254639, "learning_rate": 9.692768255547957e-06, "loss": 0.0592, "num_tokens": 26685043.0, "step": 350 }, { "epoch": 0.4379288833437305, "grad_norm": 0.15333844028749696, "learning_rate": 9.690563823826666e-06, "loss": 0.0554, "num_tokens": 26760976.0, "step": 351 }, { "epoch": 0.4391765439800374, "grad_norm": 0.1482621937184213, "learning_rate": 9.688351793346533e-06, "loss": 0.0544, "num_tokens": 26837215.0, "step": 352 }, { "epoch": 0.44042420461634435, "grad_norm": 0.15640660305026827, "learning_rate": 9.68613216811864e-06, "loss": 0.0575, "num_tokens": 26913730.0, "step": 353 }, { "epoch": 0.4416718652526513, "grad_norm": 0.15020500164794995, "learning_rate": 9.683904952167837e-06, "loss": 0.0559, "num_tokens": 26989823.0, "step": 354 }, { "epoch": 0.4429195258889582, "grad_norm": 0.17001807801907493, "learning_rate": 9.681670149532739e-06, "loss": 0.0641, "num_tokens": 27066112.0, "step": 355 }, { "epoch": 0.4441671865252651, "grad_norm": 0.1498412004993274, "learning_rate": 9.67942776426572e-06, "loss": 0.0625, "num_tokens": 27144273.0, "step": 356 }, { "epoch": 0.44541484716157204, "grad_norm": 0.1534522416368002, "learning_rate": 9.677177800432903e-06, "loss": 0.0622, "num_tokens": 27220942.0, "step": 357 }, { "epoch": 0.44666250779787897, "grad_norm": 0.1550843398492664, "learning_rate": 9.67492026211415e-06, "loss": 0.0616, "num_tokens": 27297340.0, "step": 358 }, { "epoch": 0.4479101684341859, "grad_norm": 0.15391171863206848, "learning_rate": 9.672655153403064e-06, "loss": 0.0601, "num_tokens": 27373562.0, "step": 359 }, { "epoch": 0.44915782907049284, "grad_norm": 0.15222482131910586, "learning_rate": 9.670382478406967e-06, "loss": 0.0631, "num_tokens": 27450810.0, "step": 360 }, { "epoch": 0.4504054897067998, "grad_norm": 0.15561420752022959, "learning_rate": 9.66810224124691e-06, "loss": 0.056, "num_tokens": 27526478.0, "step": 361 }, { "epoch": 0.45165315034310666, "grad_norm": 0.16603159970455272, "learning_rate": 9.665814446057652e-06, "loss": 0.0556, "num_tokens": 27602783.0, "step": 362 }, { "epoch": 0.4529008109794136, "grad_norm": 0.14679941146709885, "learning_rate": 9.663519096987653e-06, "loss": 0.0553, "num_tokens": 27678758.0, "step": 363 }, { "epoch": 0.45414847161572053, "grad_norm": 0.15949940287162023, "learning_rate": 9.661216198199078e-06, "loss": 0.0614, "num_tokens": 27755052.0, "step": 364 }, { "epoch": 0.45539613225202746, "grad_norm": 0.1524486410430796, "learning_rate": 9.658905753867778e-06, "loss": 0.0642, "num_tokens": 27832939.0, "step": 365 }, { "epoch": 0.4566437928883344, "grad_norm": 0.16992806811708352, "learning_rate": 9.656587768183287e-06, "loss": 0.0578, "num_tokens": 27909228.0, "step": 366 }, { "epoch": 0.4578914535246413, "grad_norm": 0.14352236801233487, "learning_rate": 9.654262245348813e-06, "loss": 0.0571, "num_tokens": 27985198.0, "step": 367 }, { "epoch": 0.4591391141609482, "grad_norm": 0.15126724066365793, "learning_rate": 9.651929189581233e-06, "loss": 0.0589, "num_tokens": 28061265.0, "step": 368 }, { "epoch": 0.46038677479725515, "grad_norm": 0.17699813009849166, "learning_rate": 9.649588605111082e-06, "loss": 0.0607, "num_tokens": 28139144.0, "step": 369 }, { "epoch": 0.4616344354335621, "grad_norm": 0.152592012188265, "learning_rate": 9.647240496182545e-06, "loss": 0.0547, "num_tokens": 28216085.0, "step": 370 }, { "epoch": 0.462882096069869, "grad_norm": 0.1443972817699674, "learning_rate": 9.644884867053455e-06, "loss": 0.0546, "num_tokens": 28291162.0, "step": 371 }, { "epoch": 0.4641297567061759, "grad_norm": 0.14799715036949382, "learning_rate": 9.64252172199528e-06, "loss": 0.0575, "num_tokens": 28367050.0, "step": 372 }, { "epoch": 0.46537741734248284, "grad_norm": 0.15809469352500546, "learning_rate": 9.640151065293117e-06, "loss": 0.0759, "num_tokens": 28442851.0, "step": 373 }, { "epoch": 0.46662507797878977, "grad_norm": 0.14924075699579795, "learning_rate": 9.63777290124568e-06, "loss": 0.0536, "num_tokens": 28518374.0, "step": 374 }, { "epoch": 0.4678727386150967, "grad_norm": 0.14464935799941636, "learning_rate": 9.635387234165303e-06, "loss": 0.0558, "num_tokens": 28594304.0, "step": 375 }, { "epoch": 0.46912039925140364, "grad_norm": 0.15717484410743035, "learning_rate": 9.632994068377916e-06, "loss": 0.0602, "num_tokens": 28670659.0, "step": 376 }, { "epoch": 0.4703680598877105, "grad_norm": 0.14844579724039525, "learning_rate": 9.63059340822306e-06, "loss": 0.0598, "num_tokens": 28746845.0, "step": 377 }, { "epoch": 0.47161572052401746, "grad_norm": 0.1594886034213672, "learning_rate": 9.628185258053852e-06, "loss": 0.0609, "num_tokens": 28822799.0, "step": 378 }, { "epoch": 0.4728633811603244, "grad_norm": 0.14548531881469184, "learning_rate": 9.625769622236995e-06, "loss": 0.0572, "num_tokens": 28899534.0, "step": 379 }, { "epoch": 0.4741110417966313, "grad_norm": 0.15452383568551192, "learning_rate": 9.623346505152771e-06, "loss": 0.0588, "num_tokens": 28975820.0, "step": 380 }, { "epoch": 0.47535870243293826, "grad_norm": 0.15002724218298902, "learning_rate": 9.620915911195021e-06, "loss": 0.0588, "num_tokens": 29053347.0, "step": 381 }, { "epoch": 0.47660636306924514, "grad_norm": 0.15471346976150332, "learning_rate": 9.618477844771147e-06, "loss": 0.0557, "num_tokens": 29128816.0, "step": 382 }, { "epoch": 0.4778540237055521, "grad_norm": 0.1550002737473579, "learning_rate": 9.6160323103021e-06, "loss": 0.0591, "num_tokens": 29204981.0, "step": 383 }, { "epoch": 0.479101684341859, "grad_norm": 0.15149835324532465, "learning_rate": 9.613579312222377e-06, "loss": 0.0542, "num_tokens": 29280430.0, "step": 384 }, { "epoch": 0.48034934497816595, "grad_norm": 0.16201311454989606, "learning_rate": 9.611118854979998e-06, "loss": 0.0597, "num_tokens": 29357489.0, "step": 385 }, { "epoch": 0.4815970056144729, "grad_norm": 0.14974683349924134, "learning_rate": 9.608650943036522e-06, "loss": 0.0557, "num_tokens": 29433257.0, "step": 386 }, { "epoch": 0.48284466625077976, "grad_norm": 0.1559608713414317, "learning_rate": 9.606175580867016e-06, "loss": 0.0567, "num_tokens": 29509590.0, "step": 387 }, { "epoch": 0.4840923268870867, "grad_norm": 0.15336009260629543, "learning_rate": 9.60369277296006e-06, "loss": 0.0561, "num_tokens": 29587782.0, "step": 388 }, { "epoch": 0.48533998752339363, "grad_norm": 0.15936225446679803, "learning_rate": 9.601202523817735e-06, "loss": 0.0525, "num_tokens": 29662613.0, "step": 389 }, { "epoch": 0.48658764815970057, "grad_norm": 0.1472806374581374, "learning_rate": 9.598704837955618e-06, "loss": 0.0546, "num_tokens": 29739245.0, "step": 390 }, { "epoch": 0.4878353087960075, "grad_norm": 0.16550022474863885, "learning_rate": 9.596199719902765e-06, "loss": 0.0616, "num_tokens": 29816275.0, "step": 391 }, { "epoch": 0.4890829694323144, "grad_norm": 0.16013358324619242, "learning_rate": 9.593687174201715e-06, "loss": 0.0566, "num_tokens": 29893157.0, "step": 392 }, { "epoch": 0.4903306300686213, "grad_norm": 0.1516369697660201, "learning_rate": 9.59116720540847e-06, "loss": 0.0547, "num_tokens": 29969852.0, "step": 393 }, { "epoch": 0.49157829070492826, "grad_norm": 0.15400623283535445, "learning_rate": 9.588639818092498e-06, "loss": 0.0574, "num_tokens": 30046961.0, "step": 394 }, { "epoch": 0.4928259513412352, "grad_norm": 0.14891759374811117, "learning_rate": 9.586105016836713e-06, "loss": 0.057, "num_tokens": 30123438.0, "step": 395 }, { "epoch": 0.4940736119775421, "grad_norm": 0.1543457655390604, "learning_rate": 9.58356280623748e-06, "loss": 0.0568, "num_tokens": 30199647.0, "step": 396 }, { "epoch": 0.495321272613849, "grad_norm": 0.16009856686730312, "learning_rate": 9.58101319090459e-06, "loss": 0.056, "num_tokens": 30276427.0, "step": 397 }, { "epoch": 0.49656893325015594, "grad_norm": 0.14536110498931284, "learning_rate": 9.578456175461272e-06, "loss": 0.0539, "num_tokens": 30353184.0, "step": 398 }, { "epoch": 0.4978165938864629, "grad_norm": 0.15472828164087113, "learning_rate": 9.575891764544162e-06, "loss": 0.0587, "num_tokens": 30429481.0, "step": 399 }, { "epoch": 0.4990642545227698, "grad_norm": 0.1600147319756837, "learning_rate": 9.573319962803317e-06, "loss": 0.0564, "num_tokens": 30505682.0, "step": 400 }, { "epoch": 0.5003119151590767, "grad_norm": 0.14645724562558174, "learning_rate": 9.570740774902189e-06, "loss": 0.0533, "num_tokens": 30581573.0, "step": 401 }, { "epoch": 0.5015595757953837, "grad_norm": 0.15711265382750586, "learning_rate": 9.568154205517623e-06, "loss": 0.0577, "num_tokens": 30657816.0, "step": 402 }, { "epoch": 0.5028072364316906, "grad_norm": 0.15283501905181346, "learning_rate": 9.565560259339856e-06, "loss": 0.0609, "num_tokens": 30733986.0, "step": 403 }, { "epoch": 0.5040548970679976, "grad_norm": 0.16340162533953054, "learning_rate": 9.562958941072491e-06, "loss": 0.0639, "num_tokens": 30812920.0, "step": 404 }, { "epoch": 0.5053025577043044, "grad_norm": 0.14460855761419414, "learning_rate": 9.560350255432508e-06, "loss": 0.0533, "num_tokens": 30889729.0, "step": 405 }, { "epoch": 0.5065502183406113, "grad_norm": 0.15949124398271095, "learning_rate": 9.557734207150243e-06, "loss": 0.0581, "num_tokens": 30965816.0, "step": 406 }, { "epoch": 0.5077978789769183, "grad_norm": 0.13801699119897257, "learning_rate": 9.55511080096938e-06, "loss": 0.0538, "num_tokens": 31041927.0, "step": 407 }, { "epoch": 0.5090455396132252, "grad_norm": 0.13912012303471716, "learning_rate": 9.552480041646949e-06, "loss": 0.0531, "num_tokens": 31117687.0, "step": 408 }, { "epoch": 0.5102932002495322, "grad_norm": 0.1629842644162694, "learning_rate": 9.549841933953308e-06, "loss": 0.0589, "num_tokens": 31194057.0, "step": 409 }, { "epoch": 0.511540860885839, "grad_norm": 0.15827973713090576, "learning_rate": 9.547196482672148e-06, "loss": 0.0588, "num_tokens": 31270979.0, "step": 410 }, { "epoch": 0.5127885215221459, "grad_norm": 0.15401143305786705, "learning_rate": 9.544543692600473e-06, "loss": 0.0572, "num_tokens": 31347683.0, "step": 411 }, { "epoch": 0.5140361821584529, "grad_norm": 0.1658146704954777, "learning_rate": 9.541883568548588e-06, "loss": 0.0596, "num_tokens": 31424509.0, "step": 412 }, { "epoch": 0.5152838427947598, "grad_norm": 0.14602113674983888, "learning_rate": 9.539216115340106e-06, "loss": 0.0546, "num_tokens": 31500007.0, "step": 413 }, { "epoch": 0.5165315034310668, "grad_norm": 0.1568068093453694, "learning_rate": 9.536541337811923e-06, "loss": 0.0567, "num_tokens": 31576786.0, "step": 414 }, { "epoch": 0.5177791640673737, "grad_norm": 0.14427631505975017, "learning_rate": 9.533859240814221e-06, "loss": 0.0555, "num_tokens": 31652782.0, "step": 415 }, { "epoch": 0.5190268247036806, "grad_norm": 0.13640071991365751, "learning_rate": 9.531169829210452e-06, "loss": 0.0499, "num_tokens": 31728711.0, "step": 416 }, { "epoch": 0.5202744853399875, "grad_norm": 0.15127104928479138, "learning_rate": 9.528473107877333e-06, "loss": 0.0566, "num_tokens": 31806014.0, "step": 417 }, { "epoch": 0.5215221459762944, "grad_norm": 0.14309135833657355, "learning_rate": 9.525769081704835e-06, "loss": 0.0489, "num_tokens": 31880741.0, "step": 418 }, { "epoch": 0.5227698066126014, "grad_norm": 0.15539877823691686, "learning_rate": 9.523057755596174e-06, "loss": 0.0529, "num_tokens": 31956173.0, "step": 419 }, { "epoch": 0.5240174672489083, "grad_norm": 0.17228729683993205, "learning_rate": 9.520339134467803e-06, "loss": 0.0547, "num_tokens": 32032399.0, "step": 420 }, { "epoch": 0.5252651278852152, "grad_norm": 0.1583895665904868, "learning_rate": 9.517613223249402e-06, "loss": 0.0533, "num_tokens": 32108181.0, "step": 421 }, { "epoch": 0.5265127885215222, "grad_norm": 0.15998682022689042, "learning_rate": 9.514880026883877e-06, "loss": 0.0591, "num_tokens": 32184396.0, "step": 422 }, { "epoch": 0.527760449157829, "grad_norm": 0.1565903843630229, "learning_rate": 9.512139550327338e-06, "loss": 0.0549, "num_tokens": 32259958.0, "step": 423 }, { "epoch": 0.529008109794136, "grad_norm": 0.15825635032436505, "learning_rate": 9.509391798549091e-06, "loss": 0.0499, "num_tokens": 32335779.0, "step": 424 }, { "epoch": 0.5302557704304429, "grad_norm": 0.15151788807905717, "learning_rate": 9.50663677653165e-06, "loss": 0.0613, "num_tokens": 32412537.0, "step": 425 }, { "epoch": 0.5315034310667498, "grad_norm": 0.14537670619366175, "learning_rate": 9.503874489270697e-06, "loss": 0.0546, "num_tokens": 32489973.0, "step": 426 }, { "epoch": 0.5327510917030568, "grad_norm": 0.14119646580930728, "learning_rate": 9.501104941775094e-06, "loss": 0.0523, "num_tokens": 32566189.0, "step": 427 }, { "epoch": 0.5339987523393637, "grad_norm": 0.15182301317440733, "learning_rate": 9.49832813906687e-06, "loss": 0.054, "num_tokens": 32642863.0, "step": 428 }, { "epoch": 0.5352464129756707, "grad_norm": 0.16184584308403277, "learning_rate": 9.495544086181204e-06, "loss": 0.0513, "num_tokens": 32718435.0, "step": 429 }, { "epoch": 0.5364940736119775, "grad_norm": 0.14525021676931954, "learning_rate": 9.49275278816643e-06, "loss": 0.0566, "num_tokens": 32794713.0, "step": 430 }, { "epoch": 0.5377417342482844, "grad_norm": 0.14780472666005015, "learning_rate": 9.489954250084011e-06, "loss": 0.0536, "num_tokens": 32872091.0, "step": 431 }, { "epoch": 0.5389893948845914, "grad_norm": 0.15645539651072504, "learning_rate": 9.487148477008545e-06, "loss": 0.0538, "num_tokens": 32947802.0, "step": 432 }, { "epoch": 0.5402370555208983, "grad_norm": 0.14181033938548343, "learning_rate": 9.484335474027744e-06, "loss": 0.0536, "num_tokens": 33025155.0, "step": 433 }, { "epoch": 0.5414847161572053, "grad_norm": 0.15415741993862173, "learning_rate": 9.481515246242435e-06, "loss": 0.0561, "num_tokens": 33101169.0, "step": 434 }, { "epoch": 0.5427323767935122, "grad_norm": 0.1490145272995484, "learning_rate": 9.478687798766544e-06, "loss": 0.0549, "num_tokens": 33178375.0, "step": 435 }, { "epoch": 0.543980037429819, "grad_norm": 0.15402426723128798, "learning_rate": 9.475853136727086e-06, "loss": 0.0546, "num_tokens": 33253461.0, "step": 436 }, { "epoch": 0.545227698066126, "grad_norm": 0.15545840379696368, "learning_rate": 9.473011265264159e-06, "loss": 0.0536, "num_tokens": 33329434.0, "step": 437 }, { "epoch": 0.5464753587024329, "grad_norm": 0.14007805927580225, "learning_rate": 9.470162189530938e-06, "loss": 0.0582, "num_tokens": 33405693.0, "step": 438 }, { "epoch": 0.5477230193387399, "grad_norm": 0.14677878355239657, "learning_rate": 9.467305914693658e-06, "loss": 0.0546, "num_tokens": 33481868.0, "step": 439 }, { "epoch": 0.5489706799750468, "grad_norm": 0.13752170718015977, "learning_rate": 9.464442445931605e-06, "loss": 0.051, "num_tokens": 33557214.0, "step": 440 }, { "epoch": 0.5502183406113537, "grad_norm": 0.15178088205554877, "learning_rate": 9.461571788437119e-06, "loss": 0.0576, "num_tokens": 33632980.0, "step": 441 }, { "epoch": 0.5514660012476607, "grad_norm": 0.14398063856278132, "learning_rate": 9.458693947415564e-06, "loss": 0.0535, "num_tokens": 33710087.0, "step": 442 }, { "epoch": 0.5527136618839675, "grad_norm": 0.14539327976112087, "learning_rate": 9.455808928085339e-06, "loss": 0.0519, "num_tokens": 33785845.0, "step": 443 }, { "epoch": 0.5539613225202745, "grad_norm": 0.1450732701024791, "learning_rate": 9.452916735677857e-06, "loss": 0.0556, "num_tokens": 33861967.0, "step": 444 }, { "epoch": 0.5552089831565814, "grad_norm": 0.13986151085445409, "learning_rate": 9.450017375437534e-06, "loss": 0.0501, "num_tokens": 33937887.0, "step": 445 }, { "epoch": 0.5564566437928883, "grad_norm": 0.16219894351016226, "learning_rate": 9.44711085262179e-06, "loss": 0.0573, "num_tokens": 34014701.0, "step": 446 }, { "epoch": 0.5577043044291953, "grad_norm": 0.15868863153077056, "learning_rate": 9.444197172501025e-06, "loss": 0.0559, "num_tokens": 34090915.0, "step": 447 }, { "epoch": 0.5589519650655022, "grad_norm": 0.15055297288103786, "learning_rate": 9.441276340358624e-06, "loss": 0.0515, "num_tokens": 34167324.0, "step": 448 }, { "epoch": 0.5601996257018091, "grad_norm": 0.15550675387011964, "learning_rate": 9.438348361490938e-06, "loss": 0.056, "num_tokens": 34243404.0, "step": 449 }, { "epoch": 0.561447286338116, "grad_norm": 0.14516804951124496, "learning_rate": 9.43541324120728e-06, "loss": 0.0521, "num_tokens": 34319033.0, "step": 450 }, { "epoch": 0.5626949469744229, "grad_norm": 0.14219558052085499, "learning_rate": 9.432470984829908e-06, "loss": 0.0582, "num_tokens": 34397287.0, "step": 451 }, { "epoch": 0.5639426076107299, "grad_norm": 0.14782808411123538, "learning_rate": 9.429521597694023e-06, "loss": 0.0537, "num_tokens": 34473491.0, "step": 452 }, { "epoch": 0.5651902682470368, "grad_norm": 0.14684446756874775, "learning_rate": 9.426565085147755e-06, "loss": 0.055, "num_tokens": 34550056.0, "step": 453 }, { "epoch": 0.5664379288833438, "grad_norm": 0.1476528606816408, "learning_rate": 9.423601452552153e-06, "loss": 0.0528, "num_tokens": 34626574.0, "step": 454 }, { "epoch": 0.5676855895196506, "grad_norm": 0.15032224550644455, "learning_rate": 9.420630705281182e-06, "loss": 0.0521, "num_tokens": 34702812.0, "step": 455 }, { "epoch": 0.5689332501559575, "grad_norm": 0.15507617585697697, "learning_rate": 9.417652848721704e-06, "loss": 0.0555, "num_tokens": 34779202.0, "step": 456 }, { "epoch": 0.5701809107922645, "grad_norm": 0.13946889304429896, "learning_rate": 9.41466788827347e-06, "loss": 0.0493, "num_tokens": 34854641.0, "step": 457 }, { "epoch": 0.5714285714285714, "grad_norm": 0.15804626765822227, "learning_rate": 9.411675829349119e-06, "loss": 0.0542, "num_tokens": 34931173.0, "step": 458 }, { "epoch": 0.5726762320648784, "grad_norm": 0.15864241799013198, "learning_rate": 9.408676677374158e-06, "loss": 0.0554, "num_tokens": 35007346.0, "step": 459 }, { "epoch": 0.5739238927011853, "grad_norm": 0.1486978082092752, "learning_rate": 9.405670437786953e-06, "loss": 0.0503, "num_tokens": 35082953.0, "step": 460 }, { "epoch": 0.5751715533374921, "grad_norm": 0.15936338978994993, "learning_rate": 9.402657116038728e-06, "loss": 0.0557, "num_tokens": 35159654.0, "step": 461 }, { "epoch": 0.5764192139737991, "grad_norm": 0.1419878547063181, "learning_rate": 9.399636717593545e-06, "loss": 0.0518, "num_tokens": 35234785.0, "step": 462 }, { "epoch": 0.577666874610106, "grad_norm": 0.15864527117251057, "learning_rate": 9.3966092479283e-06, "loss": 0.0552, "num_tokens": 35310616.0, "step": 463 }, { "epoch": 0.578914535246413, "grad_norm": 0.1503705578277098, "learning_rate": 9.39357471253271e-06, "loss": 0.0505, "num_tokens": 35385435.0, "step": 464 }, { "epoch": 0.5801621958827199, "grad_norm": 0.16117755152514393, "learning_rate": 9.390533116909305e-06, "loss": 0.0579, "num_tokens": 35463241.0, "step": 465 }, { "epoch": 0.5814098565190269, "grad_norm": 0.13917139410745755, "learning_rate": 9.387484466573417e-06, "loss": 0.0501, "num_tokens": 35538049.0, "step": 466 }, { "epoch": 0.5826575171553338, "grad_norm": 0.15269748868147934, "learning_rate": 9.38442876705317e-06, "loss": 0.0545, "num_tokens": 35613417.0, "step": 467 }, { "epoch": 0.5839051777916406, "grad_norm": 0.14011033067669287, "learning_rate": 9.381366023889475e-06, "loss": 0.0507, "num_tokens": 35689560.0, "step": 468 }, { "epoch": 0.5851528384279476, "grad_norm": 0.1605155443204243, "learning_rate": 9.378296242636002e-06, "loss": 0.0511, "num_tokens": 35764777.0, "step": 469 }, { "epoch": 0.5864004990642545, "grad_norm": 0.14318225311143812, "learning_rate": 9.375219428859202e-06, "loss": 0.0523, "num_tokens": 35840524.0, "step": 470 }, { "epoch": 0.5876481597005615, "grad_norm": 0.15081095543187126, "learning_rate": 9.372135588138262e-06, "loss": 0.0541, "num_tokens": 35919754.0, "step": 471 }, { "epoch": 0.5888958203368684, "grad_norm": 0.167227205565762, "learning_rate": 9.369044726065121e-06, "loss": 0.0524, "num_tokens": 35995857.0, "step": 472 }, { "epoch": 0.5901434809731753, "grad_norm": 0.14110886145114976, "learning_rate": 9.365946848244445e-06, "loss": 0.0512, "num_tokens": 36072256.0, "step": 473 }, { "epoch": 0.5913911416094823, "grad_norm": 0.1566742798779281, "learning_rate": 9.362841960293622e-06, "loss": 0.0546, "num_tokens": 36149372.0, "step": 474 }, { "epoch": 0.5926388022457891, "grad_norm": 0.1457390981400815, "learning_rate": 9.359730067842753e-06, "loss": 0.0539, "num_tokens": 36225637.0, "step": 475 }, { "epoch": 0.5938864628820961, "grad_norm": 0.14052219234375607, "learning_rate": 9.35661117653464e-06, "loss": 0.0491, "num_tokens": 36300751.0, "step": 476 }, { "epoch": 0.595134123518403, "grad_norm": 0.14585199639575464, "learning_rate": 9.353485292024775e-06, "loss": 0.0515, "num_tokens": 36377358.0, "step": 477 }, { "epoch": 0.5963817841547099, "grad_norm": 0.13602504917204367, "learning_rate": 9.35035241998133e-06, "loss": 0.0483, "num_tokens": 36452580.0, "step": 478 }, { "epoch": 0.5976294447910169, "grad_norm": 0.16038197120224465, "learning_rate": 9.347212566085153e-06, "loss": 0.0537, "num_tokens": 36527842.0, "step": 479 }, { "epoch": 0.5988771054273238, "grad_norm": 0.14394564870839627, "learning_rate": 9.344065736029746e-06, "loss": 0.0502, "num_tokens": 36603703.0, "step": 480 }, { "epoch": 0.6001247660636307, "grad_norm": 0.14127804811329153, "learning_rate": 9.34091193552126e-06, "loss": 0.0539, "num_tokens": 36680671.0, "step": 481 }, { "epoch": 0.6013724266999376, "grad_norm": 0.14612515735227102, "learning_rate": 9.337751170278495e-06, "loss": 0.0532, "num_tokens": 36757176.0, "step": 482 }, { "epoch": 0.6026200873362445, "grad_norm": 0.16129333056420253, "learning_rate": 9.334583446032866e-06, "loss": 0.0528, "num_tokens": 36834365.0, "step": 483 }, { "epoch": 0.6038677479725515, "grad_norm": 0.14048550830562176, "learning_rate": 9.331408768528423e-06, "loss": 0.0505, "num_tokens": 36911009.0, "step": 484 }, { "epoch": 0.6051154086088584, "grad_norm": 0.1401119811356599, "learning_rate": 9.328227143521809e-06, "loss": 0.0515, "num_tokens": 36986609.0, "step": 485 }, { "epoch": 0.6063630692451654, "grad_norm": 0.1393551599655052, "learning_rate": 9.325038576782275e-06, "loss": 0.0528, "num_tokens": 37063346.0, "step": 486 }, { "epoch": 0.6076107298814722, "grad_norm": 0.14547884377830422, "learning_rate": 9.321843074091654e-06, "loss": 0.0516, "num_tokens": 37140308.0, "step": 487 }, { "epoch": 0.6088583905177791, "grad_norm": 0.15034063695478633, "learning_rate": 9.318640641244362e-06, "loss": 0.0546, "num_tokens": 37216032.0, "step": 488 }, { "epoch": 0.6101060511540861, "grad_norm": 0.13651732647664802, "learning_rate": 9.315431284047375e-06, "loss": 0.0485, "num_tokens": 37291131.0, "step": 489 }, { "epoch": 0.611353711790393, "grad_norm": 0.13875747516393921, "learning_rate": 9.312215008320228e-06, "loss": 0.0544, "num_tokens": 37366741.0, "step": 490 }, { "epoch": 0.6126013724267, "grad_norm": 0.15178182686688055, "learning_rate": 9.308991819895001e-06, "loss": 0.0482, "num_tokens": 37441390.0, "step": 491 }, { "epoch": 0.6138490330630069, "grad_norm": 0.15247440606751309, "learning_rate": 9.30576172461631e-06, "loss": 0.0525, "num_tokens": 37518266.0, "step": 492 }, { "epoch": 0.6150966936993137, "grad_norm": 0.14750601213608294, "learning_rate": 9.302524728341292e-06, "loss": 0.0535, "num_tokens": 37593756.0, "step": 493 }, { "epoch": 0.6163443543356207, "grad_norm": 0.15128573856082372, "learning_rate": 9.299280836939602e-06, "loss": 0.0511, "num_tokens": 37670443.0, "step": 494 }, { "epoch": 0.6175920149719276, "grad_norm": 0.14776419777529295, "learning_rate": 9.296030056293394e-06, "loss": 0.0534, "num_tokens": 37746034.0, "step": 495 }, { "epoch": 0.6188396756082346, "grad_norm": 0.13724958742040025, "learning_rate": 9.292772392297316e-06, "loss": 0.0498, "num_tokens": 37821112.0, "step": 496 }, { "epoch": 0.6200873362445415, "grad_norm": 0.1465443892025746, "learning_rate": 9.289507850858498e-06, "loss": 0.0529, "num_tokens": 37898299.0, "step": 497 }, { "epoch": 0.6213349968808484, "grad_norm": 0.13641348218349508, "learning_rate": 9.286236437896538e-06, "loss": 0.0514, "num_tokens": 37974347.0, "step": 498 }, { "epoch": 0.6225826575171554, "grad_norm": 0.14507500345532376, "learning_rate": 9.282958159343502e-06, "loss": 0.0501, "num_tokens": 38050616.0, "step": 499 }, { "epoch": 0.6238303181534622, "grad_norm": 0.15639183855577438, "learning_rate": 9.279673021143895e-06, "loss": 0.0522, "num_tokens": 38127554.0, "step": 500 }, { "epoch": 0.6250779787897692, "grad_norm": 0.14274508358776897, "learning_rate": 9.276381029254668e-06, "loss": 0.0511, "num_tokens": 38203466.0, "step": 501 }, { "epoch": 0.6263256394260761, "grad_norm": 0.157102895207002, "learning_rate": 9.273082189645197e-06, "loss": 0.049, "num_tokens": 38278972.0, "step": 502 }, { "epoch": 0.627573300062383, "grad_norm": 0.16591536991200595, "learning_rate": 9.269776508297272e-06, "loss": 0.0484, "num_tokens": 38354884.0, "step": 503 }, { "epoch": 0.62882096069869, "grad_norm": 0.1422390165622962, "learning_rate": 9.266463991205096e-06, "loss": 0.0505, "num_tokens": 38430978.0, "step": 504 }, { "epoch": 0.6300686213349969, "grad_norm": 0.15110469574389213, "learning_rate": 9.263144644375264e-06, "loss": 0.0564, "num_tokens": 38506717.0, "step": 505 }, { "epoch": 0.6313162819713038, "grad_norm": 0.14145594461673772, "learning_rate": 9.259818473826753e-06, "loss": 0.0522, "num_tokens": 38582821.0, "step": 506 }, { "epoch": 0.6325639426076107, "grad_norm": 0.1433778713312024, "learning_rate": 9.256485485590916e-06, "loss": 0.0514, "num_tokens": 38658439.0, "step": 507 }, { "epoch": 0.6338116032439176, "grad_norm": 0.14504254404583986, "learning_rate": 9.25314568571147e-06, "loss": 0.0496, "num_tokens": 38734314.0, "step": 508 }, { "epoch": 0.6350592638802246, "grad_norm": 0.16233069854732574, "learning_rate": 9.24979908024448e-06, "loss": 0.0533, "num_tokens": 38810766.0, "step": 509 }, { "epoch": 0.6363069245165315, "grad_norm": 0.13653654499950715, "learning_rate": 9.246445675258353e-06, "loss": 0.0485, "num_tokens": 38886088.0, "step": 510 }, { "epoch": 0.6375545851528385, "grad_norm": 0.1446725493124663, "learning_rate": 9.243085476833827e-06, "loss": 0.0511, "num_tokens": 38963707.0, "step": 511 }, { "epoch": 0.6388022457891454, "grad_norm": 0.15005179375811173, "learning_rate": 9.239718491063956e-06, "loss": 0.0532, "num_tokens": 39039002.0, "step": 512 }, { "epoch": 0.6400499064254522, "grad_norm": 0.14813266791028454, "learning_rate": 9.236344724054104e-06, "loss": 0.0555, "num_tokens": 39115410.0, "step": 513 }, { "epoch": 0.6412975670617592, "grad_norm": 0.13508668746062624, "learning_rate": 9.232964181921931e-06, "loss": 0.0485, "num_tokens": 39190976.0, "step": 514 }, { "epoch": 0.6425452276980661, "grad_norm": 0.137197292476883, "learning_rate": 9.22957687079738e-06, "loss": 0.0551, "num_tokens": 39268014.0, "step": 515 }, { "epoch": 0.6437928883343731, "grad_norm": 0.14185888406488098, "learning_rate": 9.22618279682267e-06, "loss": 0.0538, "num_tokens": 39343390.0, "step": 516 }, { "epoch": 0.64504054897068, "grad_norm": 0.15637160791998272, "learning_rate": 9.222781966152284e-06, "loss": 0.0546, "num_tokens": 39419891.0, "step": 517 }, { "epoch": 0.6462882096069869, "grad_norm": 0.15136329659724634, "learning_rate": 9.219374384952955e-06, "loss": 0.0518, "num_tokens": 39496489.0, "step": 518 }, { "epoch": 0.6475358702432938, "grad_norm": 0.14065294824454894, "learning_rate": 9.215960059403657e-06, "loss": 0.0471, "num_tokens": 39572325.0, "step": 519 }, { "epoch": 0.6487835308796007, "grad_norm": 0.1500553366493087, "learning_rate": 9.212538995695597e-06, "loss": 0.053, "num_tokens": 39648939.0, "step": 520 }, { "epoch": 0.6500311915159077, "grad_norm": 0.142043093516072, "learning_rate": 9.209111200032197e-06, "loss": 0.0512, "num_tokens": 39724946.0, "step": 521 }, { "epoch": 0.6512788521522146, "grad_norm": 0.14617390218690907, "learning_rate": 9.205676678629084e-06, "loss": 0.0494, "num_tokens": 39800626.0, "step": 522 }, { "epoch": 0.6525265127885215, "grad_norm": 0.13478534135148082, "learning_rate": 9.202235437714085e-06, "loss": 0.0492, "num_tokens": 39876911.0, "step": 523 }, { "epoch": 0.6537741734248285, "grad_norm": 0.13717863504687297, "learning_rate": 9.198787483527211e-06, "loss": 0.0502, "num_tokens": 39953220.0, "step": 524 }, { "epoch": 0.6550218340611353, "grad_norm": 0.15373758129813972, "learning_rate": 9.195332822320643e-06, "loss": 0.0507, "num_tokens": 40028637.0, "step": 525 }, { "epoch": 0.6562694946974423, "grad_norm": 0.13761979854431663, "learning_rate": 9.191871460358727e-06, "loss": 0.0507, "num_tokens": 40104825.0, "step": 526 }, { "epoch": 0.6575171553337492, "grad_norm": 0.14149107473356692, "learning_rate": 9.188403403917959e-06, "loss": 0.0477, "num_tokens": 40180413.0, "step": 527 }, { "epoch": 0.6587648159700561, "grad_norm": 0.14081135806377892, "learning_rate": 9.184928659286972e-06, "loss": 0.0502, "num_tokens": 40257223.0, "step": 528 }, { "epoch": 0.6600124766063631, "grad_norm": 0.14440533546913695, "learning_rate": 9.181447232766531e-06, "loss": 0.0528, "num_tokens": 40333215.0, "step": 529 }, { "epoch": 0.66126013724267, "grad_norm": 0.14033132815670718, "learning_rate": 9.177959130669512e-06, "loss": 0.052, "num_tokens": 40410361.0, "step": 530 }, { "epoch": 0.662507797878977, "grad_norm": 0.14487287487973752, "learning_rate": 9.174464359320898e-06, "loss": 0.0505, "num_tokens": 40485973.0, "step": 531 }, { "epoch": 0.6637554585152838, "grad_norm": 0.14004587102147162, "learning_rate": 9.170962925057769e-06, "loss": 0.0509, "num_tokens": 40561925.0, "step": 532 }, { "epoch": 0.6650031191515907, "grad_norm": 0.13979887669681615, "learning_rate": 9.167454834229281e-06, "loss": 0.0525, "num_tokens": 40640581.0, "step": 533 }, { "epoch": 0.6662507797878977, "grad_norm": 0.13186354100825973, "learning_rate": 9.163940093196663e-06, "loss": 0.0476, "num_tokens": 40716574.0, "step": 534 }, { "epoch": 0.6674984404242046, "grad_norm": 0.15358376736324988, "learning_rate": 9.160418708333203e-06, "loss": 0.0581, "num_tokens": 40793125.0, "step": 535 }, { "epoch": 0.6687461010605116, "grad_norm": 0.14215032384807272, "learning_rate": 9.156890686024239e-06, "loss": 0.051, "num_tokens": 40869118.0, "step": 536 }, { "epoch": 0.6699937616968185, "grad_norm": 0.14579188342681432, "learning_rate": 9.153356032667138e-06, "loss": 0.0495, "num_tokens": 40946005.0, "step": 537 }, { "epoch": 0.6712414223331253, "grad_norm": 0.1392706758978059, "learning_rate": 9.149814754671296e-06, "loss": 0.0539, "num_tokens": 41023290.0, "step": 538 }, { "epoch": 0.6724890829694323, "grad_norm": 0.13919076738970143, "learning_rate": 9.14626685845812e-06, "loss": 0.0507, "num_tokens": 41100482.0, "step": 539 }, { "epoch": 0.6737367436057392, "grad_norm": 0.14007643948823942, "learning_rate": 9.142712350461021e-06, "loss": 0.0482, "num_tokens": 41175700.0, "step": 540 }, { "epoch": 0.6749844042420462, "grad_norm": 0.142886288926492, "learning_rate": 9.139151237125393e-06, "loss": 0.0517, "num_tokens": 41251124.0, "step": 541 }, { "epoch": 0.6762320648783531, "grad_norm": 0.1328562647414474, "learning_rate": 9.135583524908614e-06, "loss": 0.0498, "num_tokens": 41328351.0, "step": 542 }, { "epoch": 0.67747972551466, "grad_norm": 0.14304761262946025, "learning_rate": 9.132009220280021e-06, "loss": 0.0545, "num_tokens": 41404625.0, "step": 543 }, { "epoch": 0.678727386150967, "grad_norm": 0.14231842272684792, "learning_rate": 9.128428329720911e-06, "loss": 0.052, "num_tokens": 41480638.0, "step": 544 }, { "epoch": 0.6799750467872738, "grad_norm": 0.14373073570586645, "learning_rate": 9.12484085972452e-06, "loss": 0.049, "num_tokens": 41556512.0, "step": 545 }, { "epoch": 0.6812227074235808, "grad_norm": 0.1418611549255363, "learning_rate": 9.121246816796017e-06, "loss": 0.0468, "num_tokens": 41633777.0, "step": 546 }, { "epoch": 0.6824703680598877, "grad_norm": 0.14393902575169928, "learning_rate": 9.117646207452487e-06, "loss": 0.0551, "num_tokens": 41710870.0, "step": 547 }, { "epoch": 0.6837180286961946, "grad_norm": 0.128685650110848, "learning_rate": 9.114039038222922e-06, "loss": 0.0505, "num_tokens": 41787983.0, "step": 548 }, { "epoch": 0.6849656893325016, "grad_norm": 0.14694897850884, "learning_rate": 9.110425315648212e-06, "loss": 0.0518, "num_tokens": 41863855.0, "step": 549 }, { "epoch": 0.6862133499688085, "grad_norm": 0.1326421204130351, "learning_rate": 9.106805046281127e-06, "loss": 0.0482, "num_tokens": 41940301.0, "step": 550 }, { "epoch": 0.6874610106051154, "grad_norm": 0.13892809572479203, "learning_rate": 9.103178236686309e-06, "loss": 0.0503, "num_tokens": 42015296.0, "step": 551 }, { "epoch": 0.6887086712414223, "grad_norm": 0.138296019206815, "learning_rate": 9.099544893440265e-06, "loss": 0.0477, "num_tokens": 42090560.0, "step": 552 }, { "epoch": 0.6899563318777293, "grad_norm": 0.13688585206848972, "learning_rate": 9.095905023131337e-06, "loss": 0.051, "num_tokens": 42166613.0, "step": 553 }, { "epoch": 0.6912039925140362, "grad_norm": 0.14866506305162613, "learning_rate": 9.092258632359714e-06, "loss": 0.0518, "num_tokens": 42243060.0, "step": 554 }, { "epoch": 0.6924516531503431, "grad_norm": 0.13540311621936235, "learning_rate": 9.088605727737405e-06, "loss": 0.0499, "num_tokens": 42319484.0, "step": 555 }, { "epoch": 0.6936993137866501, "grad_norm": 0.13923746025516218, "learning_rate": 9.08494631588823e-06, "loss": 0.0517, "num_tokens": 42396278.0, "step": 556 }, { "epoch": 0.6949469744229569, "grad_norm": 0.1305168808594344, "learning_rate": 9.08128040344781e-06, "loss": 0.0461, "num_tokens": 42471074.0, "step": 557 }, { "epoch": 0.6961946350592639, "grad_norm": 0.13963165538464697, "learning_rate": 9.077607997063546e-06, "loss": 0.0496, "num_tokens": 42546608.0, "step": 558 }, { "epoch": 0.6974422956955708, "grad_norm": 0.14171018648487066, "learning_rate": 9.073929103394627e-06, "loss": 0.0511, "num_tokens": 42622791.0, "step": 559 }, { "epoch": 0.6986899563318777, "grad_norm": 0.1482616284034211, "learning_rate": 9.070243729111998e-06, "loss": 0.0536, "num_tokens": 42698266.0, "step": 560 }, { "epoch": 0.6999376169681847, "grad_norm": 0.1389694479982638, "learning_rate": 9.066551880898356e-06, "loss": 0.0502, "num_tokens": 42775035.0, "step": 561 }, { "epoch": 0.7011852776044916, "grad_norm": 0.14647295565403837, "learning_rate": 9.062853565448137e-06, "loss": 0.0508, "num_tokens": 42851775.0, "step": 562 }, { "epoch": 0.7024329382407986, "grad_norm": 0.14191889661269946, "learning_rate": 9.059148789467508e-06, "loss": 0.0531, "num_tokens": 42928574.0, "step": 563 }, { "epoch": 0.7036805988771054, "grad_norm": 0.1483489925428368, "learning_rate": 9.055437559674343e-06, "loss": 0.0518, "num_tokens": 43005534.0, "step": 564 }, { "epoch": 0.7049282595134123, "grad_norm": 0.14255136898330478, "learning_rate": 9.051719882798226e-06, "loss": 0.049, "num_tokens": 43081297.0, "step": 565 }, { "epoch": 0.7061759201497193, "grad_norm": 0.13615790668479794, "learning_rate": 9.047995765580428e-06, "loss": 0.0474, "num_tokens": 43157741.0, "step": 566 }, { "epoch": 0.7074235807860262, "grad_norm": 0.1348869319148483, "learning_rate": 9.044265214773901e-06, "loss": 0.0487, "num_tokens": 43234575.0, "step": 567 }, { "epoch": 0.7086712414223332, "grad_norm": 0.1345129640129995, "learning_rate": 9.040528237143258e-06, "loss": 0.0519, "num_tokens": 43311435.0, "step": 568 }, { "epoch": 0.7099189020586401, "grad_norm": 0.1424294334295593, "learning_rate": 9.036784839464771e-06, "loss": 0.0507, "num_tokens": 43388039.0, "step": 569 }, { "epoch": 0.7111665626949469, "grad_norm": 0.13921104210577145, "learning_rate": 9.033035028526352e-06, "loss": 0.0478, "num_tokens": 43463609.0, "step": 570 }, { "epoch": 0.7124142233312539, "grad_norm": 0.13815837263264233, "learning_rate": 9.029278811127539e-06, "loss": 0.0482, "num_tokens": 43540358.0, "step": 571 }, { "epoch": 0.7136618839675608, "grad_norm": 0.14661609406068885, "learning_rate": 9.025516194079493e-06, "loss": 0.0527, "num_tokens": 43616703.0, "step": 572 }, { "epoch": 0.7149095446038678, "grad_norm": 0.1420210259390237, "learning_rate": 9.021747184204974e-06, "loss": 0.0482, "num_tokens": 43692473.0, "step": 573 }, { "epoch": 0.7161572052401747, "grad_norm": 0.15807055160763936, "learning_rate": 9.017971788338338e-06, "loss": 0.0572, "num_tokens": 43770031.0, "step": 574 }, { "epoch": 0.7174048658764816, "grad_norm": 0.14478525891433547, "learning_rate": 9.014190013325514e-06, "loss": 0.0516, "num_tokens": 43846662.0, "step": 575 }, { "epoch": 0.7186525265127885, "grad_norm": 0.14868363097251575, "learning_rate": 9.010401866024007e-06, "loss": 0.0492, "num_tokens": 43922348.0, "step": 576 }, { "epoch": 0.7199001871490954, "grad_norm": 0.13523798397269673, "learning_rate": 9.006607353302874e-06, "loss": 0.0478, "num_tokens": 43997815.0, "step": 577 }, { "epoch": 0.7211478477854024, "grad_norm": 0.1322796061870224, "learning_rate": 9.00280648204271e-06, "loss": 0.0449, "num_tokens": 44074133.0, "step": 578 }, { "epoch": 0.7223955084217093, "grad_norm": 0.13851692878820665, "learning_rate": 8.998999259135648e-06, "loss": 0.0474, "num_tokens": 44148719.0, "step": 579 }, { "epoch": 0.7236431690580162, "grad_norm": 0.17724864437281473, "learning_rate": 8.99518569148533e-06, "loss": 0.0513, "num_tokens": 44224599.0, "step": 580 }, { "epoch": 0.7248908296943232, "grad_norm": 0.15309429572852976, "learning_rate": 8.991365786006908e-06, "loss": 0.0529, "num_tokens": 44300530.0, "step": 581 }, { "epoch": 0.72613849033063, "grad_norm": 0.14474973461404464, "learning_rate": 8.987539549627026e-06, "loss": 0.0526, "num_tokens": 44377954.0, "step": 582 }, { "epoch": 0.727386150966937, "grad_norm": 0.14258419787145118, "learning_rate": 8.983706989283804e-06, "loss": 0.0474, "num_tokens": 44453567.0, "step": 583 }, { "epoch": 0.7286338116032439, "grad_norm": 0.13275146203069824, "learning_rate": 8.979868111926836e-06, "loss": 0.0471, "num_tokens": 44529975.0, "step": 584 }, { "epoch": 0.7298814722395508, "grad_norm": 0.14399620167955793, "learning_rate": 8.976022924517167e-06, "loss": 0.0516, "num_tokens": 44607134.0, "step": 585 }, { "epoch": 0.7311291328758578, "grad_norm": 0.14627346228999338, "learning_rate": 8.972171434027283e-06, "loss": 0.0519, "num_tokens": 44683151.0, "step": 586 }, { "epoch": 0.7323767935121647, "grad_norm": 0.14811956571593604, "learning_rate": 8.968313647441098e-06, "loss": 0.055, "num_tokens": 44759260.0, "step": 587 }, { "epoch": 0.7336244541484717, "grad_norm": 0.13835530298124146, "learning_rate": 8.964449571753949e-06, "loss": 0.05, "num_tokens": 44835552.0, "step": 588 }, { "epoch": 0.7348721147847785, "grad_norm": 0.1341138944001767, "learning_rate": 8.96057921397257e-06, "loss": 0.0504, "num_tokens": 44912366.0, "step": 589 }, { "epoch": 0.7361197754210854, "grad_norm": 0.1495197595834141, "learning_rate": 8.95670258111509e-06, "loss": 0.053, "num_tokens": 44990082.0, "step": 590 }, { "epoch": 0.7373674360573924, "grad_norm": 0.14011781606379067, "learning_rate": 8.95281968021102e-06, "loss": 0.0481, "num_tokens": 45065731.0, "step": 591 }, { "epoch": 0.7386150966936993, "grad_norm": 0.14888004483940512, "learning_rate": 8.948930518301228e-06, "loss": 0.0546, "num_tokens": 45143212.0, "step": 592 }, { "epoch": 0.7398627573300063, "grad_norm": 0.1434778875987488, "learning_rate": 8.945035102437943e-06, "loss": 0.0533, "num_tokens": 45219875.0, "step": 593 }, { "epoch": 0.7411104179663132, "grad_norm": 0.1382641241537832, "learning_rate": 8.94113343968473e-06, "loss": 0.0525, "num_tokens": 45297056.0, "step": 594 }, { "epoch": 0.74235807860262, "grad_norm": 0.14167216666261945, "learning_rate": 8.937225537116482e-06, "loss": 0.0537, "num_tokens": 45374106.0, "step": 595 }, { "epoch": 0.743605739238927, "grad_norm": 0.1438734355396013, "learning_rate": 8.93331140181941e-06, "loss": 0.0524, "num_tokens": 45449504.0, "step": 596 }, { "epoch": 0.7448533998752339, "grad_norm": 0.14488167117333944, "learning_rate": 8.929391040891022e-06, "loss": 0.0493, "num_tokens": 45526160.0, "step": 597 }, { "epoch": 0.7461010605115409, "grad_norm": 0.13888422862308172, "learning_rate": 8.92546446144012e-06, "loss": 0.0462, "num_tokens": 45601707.0, "step": 598 }, { "epoch": 0.7473487211478478, "grad_norm": 0.14288450463132527, "learning_rate": 8.921531670586778e-06, "loss": 0.0582, "num_tokens": 45678093.0, "step": 599 }, { "epoch": 0.7485963817841547, "grad_norm": 0.134670825844891, "learning_rate": 8.917592675462333e-06, "loss": 0.05, "num_tokens": 45754558.0, "step": 600 }, { "epoch": 0.7498440424204617, "grad_norm": 0.136217251833068, "learning_rate": 8.913647483209376e-06, "loss": 0.0477, "num_tokens": 45830631.0, "step": 601 }, { "epoch": 0.7510917030567685, "grad_norm": 0.1452373399840719, "learning_rate": 8.909696100981734e-06, "loss": 0.0543, "num_tokens": 45908434.0, "step": 602 }, { "epoch": 0.7523393636930755, "grad_norm": 0.14921298014946266, "learning_rate": 8.905738535944453e-06, "loss": 0.0485, "num_tokens": 45984136.0, "step": 603 }, { "epoch": 0.7535870243293824, "grad_norm": 0.14271019380781036, "learning_rate": 8.901774795273799e-06, "loss": 0.0503, "num_tokens": 46060807.0, "step": 604 }, { "epoch": 0.7548346849656893, "grad_norm": 0.1374279559284211, "learning_rate": 8.897804886157229e-06, "loss": 0.0487, "num_tokens": 46136263.0, "step": 605 }, { "epoch": 0.7560823456019963, "grad_norm": 0.1275634793402176, "learning_rate": 8.893828815793389e-06, "loss": 0.0481, "num_tokens": 46212176.0, "step": 606 }, { "epoch": 0.7573300062383032, "grad_norm": 0.13808428299635023, "learning_rate": 8.889846591392097e-06, "loss": 0.0485, "num_tokens": 46287974.0, "step": 607 }, { "epoch": 0.7585776668746101, "grad_norm": 0.12657074289437703, "learning_rate": 8.88585822017433e-06, "loss": 0.048, "num_tokens": 46364270.0, "step": 608 }, { "epoch": 0.759825327510917, "grad_norm": 0.1286073078080098, "learning_rate": 8.881863709372207e-06, "loss": 0.045, "num_tokens": 46441185.0, "step": 609 }, { "epoch": 0.7610729881472239, "grad_norm": 0.15397840232569265, "learning_rate": 8.877863066228987e-06, "loss": 0.0463, "num_tokens": 46516606.0, "step": 610 }, { "epoch": 0.7623206487835309, "grad_norm": 0.14476818404924205, "learning_rate": 8.873856297999045e-06, "loss": 0.0493, "num_tokens": 46592201.0, "step": 611 }, { "epoch": 0.7635683094198378, "grad_norm": 0.1405957000571124, "learning_rate": 8.869843411947862e-06, "loss": 0.0476, "num_tokens": 46668676.0, "step": 612 }, { "epoch": 0.7648159700561448, "grad_norm": 0.13555365965845964, "learning_rate": 8.865824415352014e-06, "loss": 0.0478, "num_tokens": 46744569.0, "step": 613 }, { "epoch": 0.7660636306924516, "grad_norm": 0.1469494770314817, "learning_rate": 8.861799315499157e-06, "loss": 0.0478, "num_tokens": 46820151.0, "step": 614 }, { "epoch": 0.7673112913287585, "grad_norm": 0.12973149282283644, "learning_rate": 8.85776811968801e-06, "loss": 0.0473, "num_tokens": 46895382.0, "step": 615 }, { "epoch": 0.7685589519650655, "grad_norm": 0.1378044671519852, "learning_rate": 8.853730835228354e-06, "loss": 0.0531, "num_tokens": 46971400.0, "step": 616 }, { "epoch": 0.7698066126013724, "grad_norm": 0.14644478526342583, "learning_rate": 8.849687469441003e-06, "loss": 0.0531, "num_tokens": 47048651.0, "step": 617 }, { "epoch": 0.7710542732376794, "grad_norm": 0.14707076641241534, "learning_rate": 8.845638029657804e-06, "loss": 0.0468, "num_tokens": 47124698.0, "step": 618 }, { "epoch": 0.7723019338739863, "grad_norm": 0.1314006053285289, "learning_rate": 8.841582523221614e-06, "loss": 0.0517, "num_tokens": 47201140.0, "step": 619 }, { "epoch": 0.7735495945102931, "grad_norm": 0.14004488885132432, "learning_rate": 8.83752095748629e-06, "loss": 0.0489, "num_tokens": 47277704.0, "step": 620 }, { "epoch": 0.7747972551466001, "grad_norm": 0.1468794488076061, "learning_rate": 8.833453339816682e-06, "loss": 0.0501, "num_tokens": 47353136.0, "step": 621 }, { "epoch": 0.776044915782907, "grad_norm": 0.13530186455081034, "learning_rate": 8.829379677588607e-06, "loss": 0.0451, "num_tokens": 47428571.0, "step": 622 }, { "epoch": 0.777292576419214, "grad_norm": 0.13276087320331062, "learning_rate": 8.825299978188847e-06, "loss": 0.0468, "num_tokens": 47504497.0, "step": 623 }, { "epoch": 0.7785402370555209, "grad_norm": 0.14105954588118347, "learning_rate": 8.821214249015133e-06, "loss": 0.052, "num_tokens": 47580142.0, "step": 624 }, { "epoch": 0.7797878976918278, "grad_norm": 0.13687964960635024, "learning_rate": 8.817122497476122e-06, "loss": 0.0463, "num_tokens": 47655481.0, "step": 625 }, { "epoch": 0.7810355583281348, "grad_norm": 0.12988897107395347, "learning_rate": 8.8130247309914e-06, "loss": 0.0457, "num_tokens": 47730729.0, "step": 626 }, { "epoch": 0.7822832189644416, "grad_norm": 0.14026915526349465, "learning_rate": 8.808920956991455e-06, "loss": 0.051, "num_tokens": 47807189.0, "step": 627 }, { "epoch": 0.7835308796007486, "grad_norm": 0.142295896643215, "learning_rate": 8.80481118291767e-06, "loss": 0.0487, "num_tokens": 47882622.0, "step": 628 }, { "epoch": 0.7847785402370555, "grad_norm": 0.1370740658768669, "learning_rate": 8.800695416222305e-06, "loss": 0.0495, "num_tokens": 47958319.0, "step": 629 }, { "epoch": 0.7860262008733624, "grad_norm": 0.13452523773803834, "learning_rate": 8.796573664368492e-06, "loss": 0.0483, "num_tokens": 48034149.0, "step": 630 }, { "epoch": 0.7872738615096694, "grad_norm": 0.14079938056246274, "learning_rate": 8.792445934830215e-06, "loss": 0.0478, "num_tokens": 48110857.0, "step": 631 }, { "epoch": 0.7885215221459763, "grad_norm": 0.14941160855473518, "learning_rate": 8.78831223509229e-06, "loss": 0.0509, "num_tokens": 48186687.0, "step": 632 }, { "epoch": 0.7897691827822833, "grad_norm": 0.12938950186652354, "learning_rate": 8.784172572650366e-06, "loss": 0.0462, "num_tokens": 48262209.0, "step": 633 }, { "epoch": 0.7910168434185901, "grad_norm": 0.13822393920620288, "learning_rate": 8.780026955010903e-06, "loss": 0.0499, "num_tokens": 48338358.0, "step": 634 }, { "epoch": 0.7922645040548971, "grad_norm": 0.14421310117588615, "learning_rate": 8.77587538969116e-06, "loss": 0.0492, "num_tokens": 48414450.0, "step": 635 }, { "epoch": 0.793512164691204, "grad_norm": 0.15091469224096057, "learning_rate": 8.771717884219177e-06, "loss": 0.0499, "num_tokens": 48490141.0, "step": 636 }, { "epoch": 0.7947598253275109, "grad_norm": 0.14201341470304255, "learning_rate": 8.767554446133771e-06, "loss": 0.0529, "num_tokens": 48566985.0, "step": 637 }, { "epoch": 0.7960074859638179, "grad_norm": 0.1286122639124437, "learning_rate": 8.763385082984511e-06, "loss": 0.0448, "num_tokens": 48641855.0, "step": 638 }, { "epoch": 0.7972551466001248, "grad_norm": 0.1548475688653674, "learning_rate": 8.759209802331714e-06, "loss": 0.0514, "num_tokens": 48717988.0, "step": 639 }, { "epoch": 0.7985028072364317, "grad_norm": 0.12966093282997393, "learning_rate": 8.755028611746426e-06, "loss": 0.0476, "num_tokens": 48794607.0, "step": 640 }, { "epoch": 0.7997504678727386, "grad_norm": 0.13188968216215208, "learning_rate": 8.750841518810407e-06, "loss": 0.0481, "num_tokens": 48871059.0, "step": 641 }, { "epoch": 0.8009981285090455, "grad_norm": 0.13664684497120977, "learning_rate": 8.746648531116126e-06, "loss": 0.0446, "num_tokens": 48948505.0, "step": 642 }, { "epoch": 0.8022457891453525, "grad_norm": 0.14317884582014548, "learning_rate": 8.742449656266733e-06, "loss": 0.0504, "num_tokens": 49024812.0, "step": 643 }, { "epoch": 0.8034934497816594, "grad_norm": 0.1347554523074578, "learning_rate": 8.738244901876061e-06, "loss": 0.0473, "num_tokens": 49101044.0, "step": 644 }, { "epoch": 0.8047411104179664, "grad_norm": 0.13636883319815177, "learning_rate": 8.7340342755686e-06, "loss": 0.0467, "num_tokens": 49177145.0, "step": 645 }, { "epoch": 0.8059887710542732, "grad_norm": 0.1275910557926321, "learning_rate": 8.729817784979485e-06, "loss": 0.0495, "num_tokens": 49254377.0, "step": 646 }, { "epoch": 0.8072364316905801, "grad_norm": 0.14394843837797267, "learning_rate": 8.725595437754489e-06, "loss": 0.0519, "num_tokens": 49330881.0, "step": 647 }, { "epoch": 0.8084840923268871, "grad_norm": 0.1417831430199472, "learning_rate": 8.721367241550007e-06, "loss": 0.0477, "num_tokens": 49406788.0, "step": 648 }, { "epoch": 0.809731752963194, "grad_norm": 0.12920116740596488, "learning_rate": 8.717133204033034e-06, "loss": 0.0455, "num_tokens": 49482606.0, "step": 649 }, { "epoch": 0.810979413599501, "grad_norm": 0.14287675644499612, "learning_rate": 8.71289333288116e-06, "loss": 0.0498, "num_tokens": 49559132.0, "step": 650 }, { "epoch": 0.8122270742358079, "grad_norm": 0.14254507159222335, "learning_rate": 8.708647635782553e-06, "loss": 0.0484, "num_tokens": 49635261.0, "step": 651 }, { "epoch": 0.8134747348721147, "grad_norm": 0.1419100200993809, "learning_rate": 8.704396120435944e-06, "loss": 0.0537, "num_tokens": 49712659.0, "step": 652 }, { "epoch": 0.8147223955084217, "grad_norm": 0.12941282820277408, "learning_rate": 8.700138794550617e-06, "loss": 0.0491, "num_tokens": 49788525.0, "step": 653 }, { "epoch": 0.8159700561447286, "grad_norm": 0.13450455701515368, "learning_rate": 8.695875665846392e-06, "loss": 0.0497, "num_tokens": 49864781.0, "step": 654 }, { "epoch": 0.8172177167810356, "grad_norm": 0.13125571555637372, "learning_rate": 8.691606742053608e-06, "loss": 0.0479, "num_tokens": 49941093.0, "step": 655 }, { "epoch": 0.8184653774173425, "grad_norm": 0.14213099235813234, "learning_rate": 8.687332030913114e-06, "loss": 0.0467, "num_tokens": 50016325.0, "step": 656 }, { "epoch": 0.8197130380536494, "grad_norm": 0.1390710924527652, "learning_rate": 8.683051540176252e-06, "loss": 0.0495, "num_tokens": 50091945.0, "step": 657 }, { "epoch": 0.8209606986899564, "grad_norm": 0.12939775374003082, "learning_rate": 8.67876527760485e-06, "loss": 0.0452, "num_tokens": 50166912.0, "step": 658 }, { "epoch": 0.8222083593262632, "grad_norm": 0.1442490674566971, "learning_rate": 8.674473250971194e-06, "loss": 0.0495, "num_tokens": 50242363.0, "step": 659 }, { "epoch": 0.8234560199625702, "grad_norm": 0.12532738571426058, "learning_rate": 8.670175468058027e-06, "loss": 0.0455, "num_tokens": 50318669.0, "step": 660 }, { "epoch": 0.8247036805988771, "grad_norm": 0.1482963807294505, "learning_rate": 8.665871936658525e-06, "loss": 0.0515, "num_tokens": 50395068.0, "step": 661 }, { "epoch": 0.825951341235184, "grad_norm": 0.14537684974160384, "learning_rate": 8.661562664576297e-06, "loss": 0.0502, "num_tokens": 50471770.0, "step": 662 }, { "epoch": 0.827199001871491, "grad_norm": 0.14127973389090612, "learning_rate": 8.65724765962535e-06, "loss": 0.0512, "num_tokens": 50548248.0, "step": 663 }, { "epoch": 0.8284466625077979, "grad_norm": 0.13467898135586287, "learning_rate": 8.652926929630097e-06, "loss": 0.0455, "num_tokens": 50623214.0, "step": 664 }, { "epoch": 0.8296943231441049, "grad_norm": 0.1381037134217272, "learning_rate": 8.648600482425325e-06, "loss": 0.051, "num_tokens": 50699609.0, "step": 665 }, { "epoch": 0.8309419837804117, "grad_norm": 0.14193032487419316, "learning_rate": 8.644268325856193e-06, "loss": 0.0564, "num_tokens": 50778673.0, "step": 666 }, { "epoch": 0.8321896444167186, "grad_norm": 0.13611323916808238, "learning_rate": 8.639930467778206e-06, "loss": 0.0521, "num_tokens": 50855609.0, "step": 667 }, { "epoch": 0.8334373050530256, "grad_norm": 0.13531201611201343, "learning_rate": 8.635586916057214e-06, "loss": 0.0506, "num_tokens": 50931303.0, "step": 668 }, { "epoch": 0.8346849656893325, "grad_norm": 0.1303221088261824, "learning_rate": 8.631237678569391e-06, "loss": 0.0441, "num_tokens": 51006600.0, "step": 669 }, { "epoch": 0.8359326263256395, "grad_norm": 0.13622877089659338, "learning_rate": 8.626882763201215e-06, "loss": 0.0484, "num_tokens": 51082648.0, "step": 670 }, { "epoch": 0.8371802869619464, "grad_norm": 0.13249433552081497, "learning_rate": 8.62252217784947e-06, "loss": 0.0462, "num_tokens": 51158853.0, "step": 671 }, { "epoch": 0.8384279475982532, "grad_norm": 0.128012742550011, "learning_rate": 8.61815593042121e-06, "loss": 0.0465, "num_tokens": 51235008.0, "step": 672 }, { "epoch": 0.8396756082345602, "grad_norm": 0.13676039074631796, "learning_rate": 8.61378402883376e-06, "loss": 0.0484, "num_tokens": 51312341.0, "step": 673 }, { "epoch": 0.8409232688708671, "grad_norm": 0.13896442644172538, "learning_rate": 8.609406481014704e-06, "loss": 0.0538, "num_tokens": 51389560.0, "step": 674 }, { "epoch": 0.8421709295071741, "grad_norm": 0.13178732543595786, "learning_rate": 8.605023294901857e-06, "loss": 0.0492, "num_tokens": 51467043.0, "step": 675 }, { "epoch": 0.843418590143481, "grad_norm": 0.13530364899028144, "learning_rate": 8.600634478443262e-06, "loss": 0.0504, "num_tokens": 51544362.0, "step": 676 }, { "epoch": 0.8446662507797879, "grad_norm": 0.14451569461184943, "learning_rate": 8.596240039597168e-06, "loss": 0.049, "num_tokens": 51620148.0, "step": 677 }, { "epoch": 0.8459139114160948, "grad_norm": 0.12951759840418117, "learning_rate": 8.59183998633202e-06, "loss": 0.0474, "num_tokens": 51696229.0, "step": 678 }, { "epoch": 0.8471615720524017, "grad_norm": 0.13526546179585314, "learning_rate": 8.587434326626446e-06, "loss": 0.0484, "num_tokens": 51772533.0, "step": 679 }, { "epoch": 0.8484092326887087, "grad_norm": 0.12302067491500918, "learning_rate": 8.58302306846924e-06, "loss": 0.043, "num_tokens": 51848768.0, "step": 680 }, { "epoch": 0.8496568933250156, "grad_norm": 0.1457279157996279, "learning_rate": 8.57860621985934e-06, "loss": 0.0487, "num_tokens": 51925423.0, "step": 681 }, { "epoch": 0.8509045539613225, "grad_norm": 0.13073888393774447, "learning_rate": 8.574183788805838e-06, "loss": 0.0437, "num_tokens": 52001711.0, "step": 682 }, { "epoch": 0.8521522145976295, "grad_norm": 0.1416231531180916, "learning_rate": 8.56975578332793e-06, "loss": 0.0487, "num_tokens": 52078103.0, "step": 683 }, { "epoch": 0.8533998752339363, "grad_norm": 0.1443155543381733, "learning_rate": 8.56532221145493e-06, "loss": 0.0474, "num_tokens": 52154193.0, "step": 684 }, { "epoch": 0.8546475358702433, "grad_norm": 0.13662087765610023, "learning_rate": 8.560883081226246e-06, "loss": 0.0471, "num_tokens": 52230298.0, "step": 685 }, { "epoch": 0.8558951965065502, "grad_norm": 0.12505249076114922, "learning_rate": 8.55643840069136e-06, "loss": 0.0467, "num_tokens": 52306515.0, "step": 686 }, { "epoch": 0.8571428571428571, "grad_norm": 0.1310843454403954, "learning_rate": 8.551988177909825e-06, "loss": 0.0489, "num_tokens": 52383062.0, "step": 687 }, { "epoch": 0.8583905177791641, "grad_norm": 0.1320466168637091, "learning_rate": 8.547532420951236e-06, "loss": 0.0442, "num_tokens": 52458704.0, "step": 688 }, { "epoch": 0.859638178415471, "grad_norm": 0.13717646106028553, "learning_rate": 8.543071137895231e-06, "loss": 0.0489, "num_tokens": 52536401.0, "step": 689 }, { "epoch": 0.860885839051778, "grad_norm": 0.13096083678794693, "learning_rate": 8.538604336831463e-06, "loss": 0.0477, "num_tokens": 52612783.0, "step": 690 }, { "epoch": 0.8621334996880848, "grad_norm": 0.13066288819408423, "learning_rate": 8.53413202585959e-06, "loss": 0.0484, "num_tokens": 52688848.0, "step": 691 }, { "epoch": 0.8633811603243917, "grad_norm": 0.13458486228791905, "learning_rate": 8.529654213089266e-06, "loss": 0.0463, "num_tokens": 52765192.0, "step": 692 }, { "epoch": 0.8646288209606987, "grad_norm": 0.12641960186422305, "learning_rate": 8.52517090664012e-06, "loss": 0.0433, "num_tokens": 52840184.0, "step": 693 }, { "epoch": 0.8658764815970056, "grad_norm": 0.139222442092495, "learning_rate": 8.520682114641739e-06, "loss": 0.0485, "num_tokens": 52915968.0, "step": 694 }, { "epoch": 0.8671241422333126, "grad_norm": 0.1310804758808409, "learning_rate": 8.51618784523366e-06, "loss": 0.0459, "num_tokens": 52991860.0, "step": 695 }, { "epoch": 0.8683718028696195, "grad_norm": 0.14181865718671147, "learning_rate": 8.511688106565356e-06, "loss": 0.0488, "num_tokens": 53068597.0, "step": 696 }, { "epoch": 0.8696194635059263, "grad_norm": 0.13954957893418474, "learning_rate": 8.507182906796209e-06, "loss": 0.0537, "num_tokens": 53146323.0, "step": 697 }, { "epoch": 0.8708671241422333, "grad_norm": 0.13072333860570592, "learning_rate": 8.50267225409551e-06, "loss": 0.0439, "num_tokens": 53221723.0, "step": 698 }, { "epoch": 0.8721147847785402, "grad_norm": 0.13924135218825956, "learning_rate": 8.498156156642434e-06, "loss": 0.0485, "num_tokens": 53297446.0, "step": 699 }, { "epoch": 0.8733624454148472, "grad_norm": 0.1277280295683365, "learning_rate": 8.493634622626031e-06, "loss": 0.0446, "num_tokens": 53373744.0, "step": 700 }, { "epoch": 0.8746101060511541, "grad_norm": 0.13143765975583496, "learning_rate": 8.489107660245208e-06, "loss": 0.0422, "num_tokens": 53448470.0, "step": 701 }, { "epoch": 0.875857766687461, "grad_norm": 0.13289670713727797, "learning_rate": 8.484575277708718e-06, "loss": 0.0478, "num_tokens": 53526159.0, "step": 702 }, { "epoch": 0.877105427323768, "grad_norm": 0.14509962938350404, "learning_rate": 8.480037483235142e-06, "loss": 0.0475, "num_tokens": 53602483.0, "step": 703 }, { "epoch": 0.8783530879600748, "grad_norm": 0.1282228253361154, "learning_rate": 8.475494285052873e-06, "loss": 0.0497, "num_tokens": 53679362.0, "step": 704 }, { "epoch": 0.8796007485963818, "grad_norm": 0.12663678209388982, "learning_rate": 8.470945691400095e-06, "loss": 0.0455, "num_tokens": 53756359.0, "step": 705 }, { "epoch": 0.8808484092326887, "grad_norm": 0.1387150438279793, "learning_rate": 8.466391710524792e-06, "loss": 0.0486, "num_tokens": 53832649.0, "step": 706 }, { "epoch": 0.8820960698689956, "grad_norm": 0.13224121719631327, "learning_rate": 8.461832350684701e-06, "loss": 0.0468, "num_tokens": 53908486.0, "step": 707 }, { "epoch": 0.8833437305053026, "grad_norm": 0.12567190952105986, "learning_rate": 8.457267620147326e-06, "loss": 0.047, "num_tokens": 53984289.0, "step": 708 }, { "epoch": 0.8845913911416095, "grad_norm": 0.11477635822823083, "learning_rate": 8.452697527189901e-06, "loss": 0.0428, "num_tokens": 54060920.0, "step": 709 }, { "epoch": 0.8858390517779164, "grad_norm": 0.12985982634867996, "learning_rate": 8.448122080099384e-06, "loss": 0.0506, "num_tokens": 54137223.0, "step": 710 }, { "epoch": 0.8870867124142233, "grad_norm": 0.1303891811928819, "learning_rate": 8.443541287172443e-06, "loss": 0.0484, "num_tokens": 54214088.0, "step": 711 }, { "epoch": 0.8883343730505302, "grad_norm": 0.12282561290472704, "learning_rate": 8.438955156715443e-06, "loss": 0.0419, "num_tokens": 54289595.0, "step": 712 }, { "epoch": 0.8895820336868372, "grad_norm": 0.13256356712192674, "learning_rate": 8.434363697044423e-06, "loss": 0.0474, "num_tokens": 54365777.0, "step": 713 }, { "epoch": 0.8908296943231441, "grad_norm": 0.1295150884171825, "learning_rate": 8.429766916485087e-06, "loss": 0.0482, "num_tokens": 54442444.0, "step": 714 }, { "epoch": 0.8920773549594511, "grad_norm": 0.1354661630576582, "learning_rate": 8.42516482337279e-06, "loss": 0.0474, "num_tokens": 54517968.0, "step": 715 }, { "epoch": 0.8933250155957579, "grad_norm": 0.12661066471963617, "learning_rate": 8.420557426052513e-06, "loss": 0.0477, "num_tokens": 54594152.0, "step": 716 }, { "epoch": 0.8945726762320648, "grad_norm": 0.13003061445709582, "learning_rate": 8.415944732878863e-06, "loss": 0.0462, "num_tokens": 54670044.0, "step": 717 }, { "epoch": 0.8958203368683718, "grad_norm": 0.1347890412511373, "learning_rate": 8.411326752216048e-06, "loss": 0.0477, "num_tokens": 54745643.0, "step": 718 }, { "epoch": 0.8970679975046787, "grad_norm": 0.13806299761156438, "learning_rate": 8.406703492437863e-06, "loss": 0.0462, "num_tokens": 54820709.0, "step": 719 }, { "epoch": 0.8983156581409857, "grad_norm": 0.15064441560708958, "learning_rate": 8.402074961927674e-06, "loss": 0.0506, "num_tokens": 54896955.0, "step": 720 }, { "epoch": 0.8995633187772926, "grad_norm": 0.12826416085523926, "learning_rate": 8.397441169078404e-06, "loss": 0.0468, "num_tokens": 54974212.0, "step": 721 }, { "epoch": 0.9008109794135996, "grad_norm": 0.12699295137628364, "learning_rate": 8.392802122292522e-06, "loss": 0.0451, "num_tokens": 55050113.0, "step": 722 }, { "epoch": 0.9020586400499064, "grad_norm": 0.1401057811483236, "learning_rate": 8.388157829982023e-06, "loss": 0.045, "num_tokens": 55126297.0, "step": 723 }, { "epoch": 0.9033063006862133, "grad_norm": 0.13188033083997214, "learning_rate": 8.383508300568409e-06, "loss": 0.0527, "num_tokens": 55203060.0, "step": 724 }, { "epoch": 0.9045539613225203, "grad_norm": 0.1272260308340507, "learning_rate": 8.378853542482687e-06, "loss": 0.0444, "num_tokens": 55278770.0, "step": 725 }, { "epoch": 0.9058016219588272, "grad_norm": 0.14369524671244313, "learning_rate": 8.374193564165338e-06, "loss": 0.0444, "num_tokens": 55354950.0, "step": 726 }, { "epoch": 0.9070492825951342, "grad_norm": 0.13648519325941133, "learning_rate": 8.36952837406631e-06, "loss": 0.0473, "num_tokens": 55431358.0, "step": 727 }, { "epoch": 0.9082969432314411, "grad_norm": 0.12922616460751848, "learning_rate": 8.364857980645006e-06, "loss": 0.0454, "num_tokens": 55507747.0, "step": 728 }, { "epoch": 0.9095446038677479, "grad_norm": 0.127875634016502, "learning_rate": 8.360182392370258e-06, "loss": 0.0481, "num_tokens": 55585197.0, "step": 729 }, { "epoch": 0.9107922645040549, "grad_norm": 0.13104378719838006, "learning_rate": 8.355501617720321e-06, "loss": 0.0439, "num_tokens": 55660574.0, "step": 730 }, { "epoch": 0.9120399251403618, "grad_norm": 0.12926845838420958, "learning_rate": 8.350815665182855e-06, "loss": 0.0441, "num_tokens": 55736377.0, "step": 731 }, { "epoch": 0.9132875857766688, "grad_norm": 0.1270523993265075, "learning_rate": 8.34612454325491e-06, "loss": 0.0471, "num_tokens": 55812894.0, "step": 732 }, { "epoch": 0.9145352464129757, "grad_norm": 0.12394117715946282, "learning_rate": 8.341428260442907e-06, "loss": 0.0447, "num_tokens": 55888948.0, "step": 733 }, { "epoch": 0.9157829070492826, "grad_norm": 0.12072645194274476, "learning_rate": 8.336726825262622e-06, "loss": 0.0451, "num_tokens": 55964442.0, "step": 734 }, { "epoch": 0.9170305676855895, "grad_norm": 0.12672993586675507, "learning_rate": 8.332020246239183e-06, "loss": 0.0465, "num_tokens": 56041127.0, "step": 735 }, { "epoch": 0.9182782283218964, "grad_norm": 0.13657796012109533, "learning_rate": 8.327308531907039e-06, "loss": 0.0482, "num_tokens": 56119235.0, "step": 736 }, { "epoch": 0.9195258889582034, "grad_norm": 0.13041755901650912, "learning_rate": 8.322591690809952e-06, "loss": 0.0469, "num_tokens": 56195147.0, "step": 737 }, { "epoch": 0.9207735495945103, "grad_norm": 0.12361588139014848, "learning_rate": 8.317869731500981e-06, "loss": 0.0557, "num_tokens": 56271734.0, "step": 738 }, { "epoch": 0.9220212102308172, "grad_norm": 0.1272077384011452, "learning_rate": 8.313142662542465e-06, "loss": 0.0462, "num_tokens": 56348987.0, "step": 739 }, { "epoch": 0.9232688708671242, "grad_norm": 0.1399474323212219, "learning_rate": 8.30841049250601e-06, "loss": 0.0455, "num_tokens": 56426136.0, "step": 740 }, { "epoch": 0.924516531503431, "grad_norm": 0.13812075909506322, "learning_rate": 8.303673229972468e-06, "loss": 0.0496, "num_tokens": 56503751.0, "step": 741 }, { "epoch": 0.925764192139738, "grad_norm": 0.13599501450992818, "learning_rate": 8.298930883531932e-06, "loss": 0.0524, "num_tokens": 56581417.0, "step": 742 }, { "epoch": 0.9270118527760449, "grad_norm": 0.1325812107893234, "learning_rate": 8.294183461783704e-06, "loss": 0.0471, "num_tokens": 56656845.0, "step": 743 }, { "epoch": 0.9282595134123518, "grad_norm": 0.12712734428173242, "learning_rate": 8.2894309733363e-06, "loss": 0.0454, "num_tokens": 56733635.0, "step": 744 }, { "epoch": 0.9295071740486588, "grad_norm": 0.14005206065485848, "learning_rate": 8.284673426807413e-06, "loss": 0.0463, "num_tokens": 56809680.0, "step": 745 }, { "epoch": 0.9307548346849657, "grad_norm": 0.14087300313657175, "learning_rate": 8.279910830823917e-06, "loss": 0.0459, "num_tokens": 56884969.0, "step": 746 }, { "epoch": 0.9320024953212727, "grad_norm": 0.1352822553422503, "learning_rate": 8.275143194021837e-06, "loss": 0.0426, "num_tokens": 56960412.0, "step": 747 }, { "epoch": 0.9332501559575795, "grad_norm": 0.1248091276939536, "learning_rate": 8.270370525046338e-06, "loss": 0.0442, "num_tokens": 57035943.0, "step": 748 }, { "epoch": 0.9344978165938864, "grad_norm": 0.14141827065829213, "learning_rate": 8.265592832551714e-06, "loss": 0.0499, "num_tokens": 57112296.0, "step": 749 }, { "epoch": 0.9357454772301934, "grad_norm": 0.15528789441951488, "learning_rate": 8.260810125201363e-06, "loss": 0.0523, "num_tokens": 57189546.0, "step": 750 }, { "epoch": 0.9369931378665003, "grad_norm": 0.1444757644146868, "learning_rate": 8.25602241166778e-06, "loss": 0.0477, "num_tokens": 57266306.0, "step": 751 }, { "epoch": 0.9382407985028073, "grad_norm": 0.12551880038959676, "learning_rate": 8.251229700632536e-06, "loss": 0.0459, "num_tokens": 57341686.0, "step": 752 }, { "epoch": 0.9394884591391142, "grad_norm": 0.13610947168319384, "learning_rate": 8.246432000786267e-06, "loss": 0.0466, "num_tokens": 57418946.0, "step": 753 }, { "epoch": 0.940736119775421, "grad_norm": 0.1418391670800807, "learning_rate": 8.241629320828652e-06, "loss": 0.0436, "num_tokens": 57494409.0, "step": 754 }, { "epoch": 0.941983780411728, "grad_norm": 0.14475322007531208, "learning_rate": 8.2368216694684e-06, "loss": 0.0474, "num_tokens": 57572009.0, "step": 755 }, { "epoch": 0.9432314410480349, "grad_norm": 0.13184602873921808, "learning_rate": 8.232009055423236e-06, "loss": 0.0423, "num_tokens": 57646117.0, "step": 756 }, { "epoch": 0.9444791016843419, "grad_norm": 0.12812576418278393, "learning_rate": 8.227191487419887e-06, "loss": 0.044, "num_tokens": 57721560.0, "step": 757 }, { "epoch": 0.9457267623206488, "grad_norm": 0.12654952839318306, "learning_rate": 8.222368974194057e-06, "loss": 0.0461, "num_tokens": 57798213.0, "step": 758 }, { "epoch": 0.9469744229569557, "grad_norm": 0.12934305800781434, "learning_rate": 8.217541524490422e-06, "loss": 0.0478, "num_tokens": 57875229.0, "step": 759 }, { "epoch": 0.9482220835932627, "grad_norm": 0.13295210650526854, "learning_rate": 8.212709147062604e-06, "loss": 0.0444, "num_tokens": 57950833.0, "step": 760 }, { "epoch": 0.9494697442295695, "grad_norm": 0.12300137067417298, "learning_rate": 8.207871850673168e-06, "loss": 0.0465, "num_tokens": 58027155.0, "step": 761 }, { "epoch": 0.9507174048658765, "grad_norm": 0.12846718774897786, "learning_rate": 8.203029644093593e-06, "loss": 0.0485, "num_tokens": 58103747.0, "step": 762 }, { "epoch": 0.9519650655021834, "grad_norm": 0.12166615887497252, "learning_rate": 8.198182536104262e-06, "loss": 0.0444, "num_tokens": 58179264.0, "step": 763 }, { "epoch": 0.9532127261384903, "grad_norm": 0.13359004503185487, "learning_rate": 8.193330535494448e-06, "loss": 0.0459, "num_tokens": 58255206.0, "step": 764 }, { "epoch": 0.9544603867747973, "grad_norm": 0.13069849304221773, "learning_rate": 8.188473651062296e-06, "loss": 0.0455, "num_tokens": 58330290.0, "step": 765 }, { "epoch": 0.9557080474111042, "grad_norm": 0.1376163273913286, "learning_rate": 8.183611891614803e-06, "loss": 0.0443, "num_tokens": 58405956.0, "step": 766 }, { "epoch": 0.9569557080474111, "grad_norm": 0.13686003984330788, "learning_rate": 8.178745265967808e-06, "loss": 0.0464, "num_tokens": 58482233.0, "step": 767 }, { "epoch": 0.958203368683718, "grad_norm": 0.12627060136965676, "learning_rate": 8.173873782945976e-06, "loss": 0.047, "num_tokens": 58559969.0, "step": 768 }, { "epoch": 0.9594510293200249, "grad_norm": 0.12879510165555993, "learning_rate": 8.168997451382778e-06, "loss": 0.0482, "num_tokens": 58635674.0, "step": 769 }, { "epoch": 0.9606986899563319, "grad_norm": 0.1331376647953998, "learning_rate": 8.164116280120478e-06, "loss": 0.0442, "num_tokens": 58712224.0, "step": 770 }, { "epoch": 0.9619463505926388, "grad_norm": 0.13246529667252444, "learning_rate": 8.159230278010113e-06, "loss": 0.0464, "num_tokens": 58788204.0, "step": 771 }, { "epoch": 0.9631940112289458, "grad_norm": 0.13752849551124316, "learning_rate": 8.154339453911483e-06, "loss": 0.0453, "num_tokens": 58864120.0, "step": 772 }, { "epoch": 0.9644416718652526, "grad_norm": 0.118632440035503, "learning_rate": 8.14944381669313e-06, "loss": 0.0401, "num_tokens": 58938337.0, "step": 773 }, { "epoch": 0.9656893325015595, "grad_norm": 0.11478561278311256, "learning_rate": 8.144543375232322e-06, "loss": 0.0442, "num_tokens": 59013332.0, "step": 774 }, { "epoch": 0.9669369931378665, "grad_norm": 0.13782337191599275, "learning_rate": 8.139638138415041e-06, "loss": 0.0462, "num_tokens": 59088997.0, "step": 775 }, { "epoch": 0.9681846537741734, "grad_norm": 0.12839918012353083, "learning_rate": 8.134728115135967e-06, "loss": 0.0484, "num_tokens": 59164665.0, "step": 776 }, { "epoch": 0.9694323144104804, "grad_norm": 0.12847654480275927, "learning_rate": 8.129813314298457e-06, "loss": 0.0469, "num_tokens": 59241148.0, "step": 777 }, { "epoch": 0.9706799750467873, "grad_norm": 0.12255793939033695, "learning_rate": 8.124893744814524e-06, "loss": 0.0471, "num_tokens": 59318320.0, "step": 778 }, { "epoch": 0.9719276356830941, "grad_norm": 0.1254272675633237, "learning_rate": 8.11996941560484e-06, "loss": 0.042, "num_tokens": 59393045.0, "step": 779 }, { "epoch": 0.9731752963194011, "grad_norm": 0.1273761325986967, "learning_rate": 8.115040335598701e-06, "loss": 0.046, "num_tokens": 59472300.0, "step": 780 }, { "epoch": 0.974422956955708, "grad_norm": 0.12602861483850217, "learning_rate": 8.110106513734019e-06, "loss": 0.0429, "num_tokens": 59548496.0, "step": 781 }, { "epoch": 0.975670617592015, "grad_norm": 0.1363028683424439, "learning_rate": 8.105167958957302e-06, "loss": 0.0493, "num_tokens": 59626492.0, "step": 782 }, { "epoch": 0.9769182782283219, "grad_norm": 0.12803093679059996, "learning_rate": 8.100224680223647e-06, "loss": 0.0538, "num_tokens": 59702812.0, "step": 783 }, { "epoch": 0.9781659388646288, "grad_norm": 0.1316321034732895, "learning_rate": 8.09527668649671e-06, "loss": 0.0462, "num_tokens": 59779318.0, "step": 784 }, { "epoch": 0.9794135995009358, "grad_norm": 0.13441963780619764, "learning_rate": 8.090323986748696e-06, "loss": 0.0485, "num_tokens": 59855866.0, "step": 785 }, { "epoch": 0.9806612601372426, "grad_norm": 0.13172648789937602, "learning_rate": 8.085366589960353e-06, "loss": 0.045, "num_tokens": 59933364.0, "step": 786 }, { "epoch": 0.9819089207735496, "grad_norm": 0.13013258573518577, "learning_rate": 8.080404505120936e-06, "loss": 0.0441, "num_tokens": 60009279.0, "step": 787 }, { "epoch": 0.9831565814098565, "grad_norm": 0.1319062787442704, "learning_rate": 8.075437741228205e-06, "loss": 0.0464, "num_tokens": 60085623.0, "step": 788 }, { "epoch": 0.9844042420461634, "grad_norm": 0.1308809048521274, "learning_rate": 8.070466307288404e-06, "loss": 0.0456, "num_tokens": 60161774.0, "step": 789 }, { "epoch": 0.9856519026824704, "grad_norm": 0.12602659288322934, "learning_rate": 8.065490212316245e-06, "loss": 0.0439, "num_tokens": 60238406.0, "step": 790 }, { "epoch": 0.9868995633187773, "grad_norm": 0.13344938845482, "learning_rate": 8.060509465334895e-06, "loss": 0.0473, "num_tokens": 60314716.0, "step": 791 }, { "epoch": 0.9881472239550843, "grad_norm": 0.1357885856390025, "learning_rate": 8.055524075375951e-06, "loss": 0.047, "num_tokens": 60391513.0, "step": 792 }, { "epoch": 0.9893948845913911, "grad_norm": 0.13692794199262653, "learning_rate": 8.050534051479432e-06, "loss": 0.0488, "num_tokens": 60468385.0, "step": 793 }, { "epoch": 0.990642545227698, "grad_norm": 0.1337920475499864, "learning_rate": 8.045539402693759e-06, "loss": 0.0478, "num_tokens": 60545686.0, "step": 794 }, { "epoch": 0.991890205864005, "grad_norm": 0.12891045119563382, "learning_rate": 8.040540138075743e-06, "loss": 0.0484, "num_tokens": 60622263.0, "step": 795 }, { "epoch": 0.9931378665003119, "grad_norm": 0.12475341972299739, "learning_rate": 8.035536266690561e-06, "loss": 0.0453, "num_tokens": 60698072.0, "step": 796 }, { "epoch": 0.9943855271366189, "grad_norm": 0.12557072799605964, "learning_rate": 8.030527797611742e-06, "loss": 0.0473, "num_tokens": 60773991.0, "step": 797 }, { "epoch": 0.9956331877729258, "grad_norm": 0.1326655960570811, "learning_rate": 8.025514739921155e-06, "loss": 0.0471, "num_tokens": 60850174.0, "step": 798 }, { "epoch": 0.9968808484092326, "grad_norm": 0.14443419491894904, "learning_rate": 8.02049710270899e-06, "loss": 0.0487, "num_tokens": 60926616.0, "step": 799 }, { "epoch": 0.9981285090455396, "grad_norm": 0.13979837723347358, "learning_rate": 8.015474895073739e-06, "loss": 0.0489, "num_tokens": 61003563.0, "step": 800 }, { "epoch": 0.9993761696818465, "grad_norm": 0.13287610948560982, "learning_rate": 8.010448126122183e-06, "loss": 0.0519, "num_tokens": 61080948.0, "step": 801 }, { "epoch": 1.0, "grad_norm": 0.13287610948560982, "learning_rate": 8.005416804969374e-06, "loss": 0.0464, "num_tokens": 61119461.0, "step": 802 }, { "epoch": 1.001247660636307, "grad_norm": 0.21428891616070306, "learning_rate": 8.000380940738616e-06, "loss": 0.0394, "num_tokens": 61194181.0, "step": 803 }, { "epoch": 1.0024953212726138, "grad_norm": 0.11463542811424311, "learning_rate": 7.995340542561453e-06, "loss": 0.0382, "num_tokens": 61269860.0, "step": 804 }, { "epoch": 1.0037429819089208, "grad_norm": 0.11296312584424995, "learning_rate": 7.990295619577653e-06, "loss": 0.0393, "num_tokens": 61346977.0, "step": 805 }, { "epoch": 1.0049906425452277, "grad_norm": 0.12142614458207387, "learning_rate": 7.985246180935184e-06, "loss": 0.0426, "num_tokens": 61423778.0, "step": 806 }, { "epoch": 1.0062383031815347, "grad_norm": 0.124283410642958, "learning_rate": 7.980192235790207e-06, "loss": 0.04, "num_tokens": 61500104.0, "step": 807 }, { "epoch": 1.0074859638178415, "grad_norm": 0.11514734690512161, "learning_rate": 7.97513379330705e-06, "loss": 0.0409, "num_tokens": 61577699.0, "step": 808 }, { "epoch": 1.0087336244541485, "grad_norm": 0.12772053062936375, "learning_rate": 7.970070862658198e-06, "loss": 0.0385, "num_tokens": 61654726.0, "step": 809 }, { "epoch": 1.0099812850904555, "grad_norm": 0.1420848581428031, "learning_rate": 7.965003453024273e-06, "loss": 0.0402, "num_tokens": 61731992.0, "step": 810 }, { "epoch": 1.0112289457267623, "grad_norm": 0.12101709378925632, "learning_rate": 7.959931573594025e-06, "loss": 0.0376, "num_tokens": 61809566.0, "step": 811 }, { "epoch": 1.0124766063630692, "grad_norm": 0.12379290639805463, "learning_rate": 7.954855233564301e-06, "loss": 0.0388, "num_tokens": 61886528.0, "step": 812 }, { "epoch": 1.0137242669993762, "grad_norm": 0.1310115238681384, "learning_rate": 7.949774442140043e-06, "loss": 0.0385, "num_tokens": 61962417.0, "step": 813 }, { "epoch": 1.014971927635683, "grad_norm": 0.13401797553444206, "learning_rate": 7.944689208534257e-06, "loss": 0.0383, "num_tokens": 62038796.0, "step": 814 }, { "epoch": 1.01621958827199, "grad_norm": 0.12765079817825717, "learning_rate": 7.939599541968012e-06, "loss": 0.0442, "num_tokens": 62114741.0, "step": 815 }, { "epoch": 1.017467248908297, "grad_norm": 0.1469418814720509, "learning_rate": 7.93450545167041e-06, "loss": 0.0379, "num_tokens": 62191755.0, "step": 816 }, { "epoch": 1.018714909544604, "grad_norm": 0.1369016694262358, "learning_rate": 7.929406946878576e-06, "loss": 0.0369, "num_tokens": 62266358.0, "step": 817 }, { "epoch": 1.0199625701809107, "grad_norm": 0.12983077233787918, "learning_rate": 7.924304036837643e-06, "loss": 0.0387, "num_tokens": 62341490.0, "step": 818 }, { "epoch": 1.0212102308172177, "grad_norm": 0.13162911142976996, "learning_rate": 7.919196730800727e-06, "loss": 0.0387, "num_tokens": 62418083.0, "step": 819 }, { "epoch": 1.0224578914535247, "grad_norm": 0.1358666256385649, "learning_rate": 7.914085038028918e-06, "loss": 0.041, "num_tokens": 62494660.0, "step": 820 }, { "epoch": 1.0237055520898315, "grad_norm": 0.13858343996922898, "learning_rate": 7.908968967791262e-06, "loss": 0.0424, "num_tokens": 62571070.0, "step": 821 }, { "epoch": 1.0249532127261385, "grad_norm": 0.11844680272321946, "learning_rate": 7.903848529364738e-06, "loss": 0.0405, "num_tokens": 62648820.0, "step": 822 }, { "epoch": 1.0262008733624455, "grad_norm": 0.1252962753765857, "learning_rate": 7.89872373203425e-06, "loss": 0.0398, "num_tokens": 62726276.0, "step": 823 }, { "epoch": 1.0274485339987522, "grad_norm": 0.11847161903518785, "learning_rate": 7.893594585092601e-06, "loss": 0.0415, "num_tokens": 62802667.0, "step": 824 }, { "epoch": 1.0286961946350592, "grad_norm": 0.13434324948235268, "learning_rate": 7.888461097840494e-06, "loss": 0.041, "num_tokens": 62878138.0, "step": 825 }, { "epoch": 1.0299438552713662, "grad_norm": 0.12675768390987047, "learning_rate": 7.883323279586483e-06, "loss": 0.0393, "num_tokens": 62954633.0, "step": 826 }, { "epoch": 1.0311915159076732, "grad_norm": 0.11827927529499944, "learning_rate": 7.87818113964699e-06, "loss": 0.0391, "num_tokens": 63030685.0, "step": 827 }, { "epoch": 1.03243917654398, "grad_norm": 0.13125937485650266, "learning_rate": 7.873034687346268e-06, "loss": 0.0372, "num_tokens": 63106146.0, "step": 828 }, { "epoch": 1.033686837180287, "grad_norm": 0.11652636577512891, "learning_rate": 7.86788393201639e-06, "loss": 0.0379, "num_tokens": 63181401.0, "step": 829 }, { "epoch": 1.034934497816594, "grad_norm": 0.1341701967742822, "learning_rate": 7.862728882997236e-06, "loss": 0.0423, "num_tokens": 63257458.0, "step": 830 }, { "epoch": 1.0361821584529007, "grad_norm": 0.1364539989781808, "learning_rate": 7.857569549636462e-06, "loss": 0.0388, "num_tokens": 63334805.0, "step": 831 }, { "epoch": 1.0374298190892077, "grad_norm": 0.12536924010011452, "learning_rate": 7.852405941289503e-06, "loss": 0.0419, "num_tokens": 63411725.0, "step": 832 }, { "epoch": 1.0386774797255147, "grad_norm": 0.11900831109108075, "learning_rate": 7.847238067319542e-06, "loss": 0.0369, "num_tokens": 63487166.0, "step": 833 }, { "epoch": 1.0399251403618215, "grad_norm": 0.13712182486003496, "learning_rate": 7.842065937097495e-06, "loss": 0.0388, "num_tokens": 63561195.0, "step": 834 }, { "epoch": 1.0411728009981285, "grad_norm": 0.1322764156887544, "learning_rate": 7.836889560001997e-06, "loss": 0.0401, "num_tokens": 63638227.0, "step": 835 }, { "epoch": 1.0424204616344355, "grad_norm": 0.12845001330725533, "learning_rate": 7.831708945419383e-06, "loss": 0.0382, "num_tokens": 63714554.0, "step": 836 }, { "epoch": 1.0436681222707425, "grad_norm": 0.12086453985035646, "learning_rate": 7.826524102743678e-06, "loss": 0.0385, "num_tokens": 63791759.0, "step": 837 }, { "epoch": 1.0449157829070492, "grad_norm": 0.11543493690448661, "learning_rate": 7.821335041376565e-06, "loss": 0.0351, "num_tokens": 63867593.0, "step": 838 }, { "epoch": 1.0461634435433562, "grad_norm": 0.11861579198072326, "learning_rate": 7.816141770727381e-06, "loss": 0.0379, "num_tokens": 63942779.0, "step": 839 }, { "epoch": 1.0474111041796632, "grad_norm": 0.13188356504968343, "learning_rate": 7.810944300213095e-06, "loss": 0.0384, "num_tokens": 64020656.0, "step": 840 }, { "epoch": 1.04865876481597, "grad_norm": 0.1323045158713511, "learning_rate": 7.805742639258297e-06, "loss": 0.0376, "num_tokens": 64095353.0, "step": 841 }, { "epoch": 1.049906425452277, "grad_norm": 0.13242016376152926, "learning_rate": 7.800536797295164e-06, "loss": 0.042, "num_tokens": 64172670.0, "step": 842 }, { "epoch": 1.051154086088584, "grad_norm": 0.12019785744720937, "learning_rate": 7.795326783763463e-06, "loss": 0.0411, "num_tokens": 64252219.0, "step": 843 }, { "epoch": 1.0524017467248907, "grad_norm": 0.12945649987325733, "learning_rate": 7.790112608110523e-06, "loss": 0.0383, "num_tokens": 64327950.0, "step": 844 }, { "epoch": 1.0536494073611977, "grad_norm": 0.11589408734195418, "learning_rate": 7.784894279791224e-06, "loss": 0.0373, "num_tokens": 64403741.0, "step": 845 }, { "epoch": 1.0548970679975047, "grad_norm": 0.11991035427603171, "learning_rate": 7.779671808267968e-06, "loss": 0.0371, "num_tokens": 64479652.0, "step": 846 }, { "epoch": 1.0561447286338117, "grad_norm": 0.13203629959562918, "learning_rate": 7.774445203010676e-06, "loss": 0.0386, "num_tokens": 64555318.0, "step": 847 }, { "epoch": 1.0573923892701185, "grad_norm": 0.129045226504299, "learning_rate": 7.769214473496766e-06, "loss": 0.0397, "num_tokens": 64630854.0, "step": 848 }, { "epoch": 1.0586400499064255, "grad_norm": 0.13424488516410796, "learning_rate": 7.763979629211127e-06, "loss": 0.0405, "num_tokens": 64706946.0, "step": 849 }, { "epoch": 1.0598877105427325, "grad_norm": 0.1315607090534157, "learning_rate": 7.758740679646115e-06, "loss": 0.0379, "num_tokens": 64782647.0, "step": 850 }, { "epoch": 1.0611353711790392, "grad_norm": 0.12879285629017997, "learning_rate": 7.753497634301532e-06, "loss": 0.038, "num_tokens": 64859089.0, "step": 851 }, { "epoch": 1.0623830318153462, "grad_norm": 0.15633026157590887, "learning_rate": 7.748250502684601e-06, "loss": 0.0398, "num_tokens": 64935733.0, "step": 852 }, { "epoch": 1.0636306924516532, "grad_norm": 0.13634460601034967, "learning_rate": 7.742999294309959e-06, "loss": 0.0361, "num_tokens": 65010556.0, "step": 853 }, { "epoch": 1.06487835308796, "grad_norm": 0.12412517833563698, "learning_rate": 7.737744018699634e-06, "loss": 0.0405, "num_tokens": 65087047.0, "step": 854 }, { "epoch": 1.066126013724267, "grad_norm": 0.12716959844997922, "learning_rate": 7.732484685383027e-06, "loss": 0.0374, "num_tokens": 65162777.0, "step": 855 }, { "epoch": 1.067373674360574, "grad_norm": 0.12952911734554354, "learning_rate": 7.7272213038969e-06, "loss": 0.0387, "num_tokens": 65239214.0, "step": 856 }, { "epoch": 1.068621334996881, "grad_norm": 0.126084293916441, "learning_rate": 7.72195388378536e-06, "loss": 0.0393, "num_tokens": 65315610.0, "step": 857 }, { "epoch": 1.0698689956331877, "grad_norm": 0.12342840379376283, "learning_rate": 7.716682434599823e-06, "loss": 0.0428, "num_tokens": 65393681.0, "step": 858 }, { "epoch": 1.0711166562694947, "grad_norm": 0.13963961410762832, "learning_rate": 7.711406965899026e-06, "loss": 0.0404, "num_tokens": 65469270.0, "step": 859 }, { "epoch": 1.0723643169058017, "grad_norm": 0.11725006905637131, "learning_rate": 7.706127487248984e-06, "loss": 0.0466, "num_tokens": 65544990.0, "step": 860 }, { "epoch": 1.0736119775421085, "grad_norm": 0.14170062304512607, "learning_rate": 7.70084400822299e-06, "loss": 0.0398, "num_tokens": 65620506.0, "step": 861 }, { "epoch": 1.0748596381784155, "grad_norm": 0.12578447988533376, "learning_rate": 7.695556538401588e-06, "loss": 0.0376, "num_tokens": 65696157.0, "step": 862 }, { "epoch": 1.0761072988147224, "grad_norm": 0.12367582814838458, "learning_rate": 7.690265087372559e-06, "loss": 0.0431, "num_tokens": 65773019.0, "step": 863 }, { "epoch": 1.0773549594510294, "grad_norm": 0.14315074651628906, "learning_rate": 7.684969664730903e-06, "loss": 0.0391, "num_tokens": 65848646.0, "step": 864 }, { "epoch": 1.0786026200873362, "grad_norm": 0.12376004136236546, "learning_rate": 7.679670280078823e-06, "loss": 0.0402, "num_tokens": 65925176.0, "step": 865 }, { "epoch": 1.0798502807236432, "grad_norm": 0.13019320712059, "learning_rate": 7.674366943025705e-06, "loss": 0.0407, "num_tokens": 66002394.0, "step": 866 }, { "epoch": 1.0810979413599502, "grad_norm": 0.11984237762577243, "learning_rate": 7.669059663188099e-06, "loss": 0.0381, "num_tokens": 66078590.0, "step": 867 }, { "epoch": 1.082345601996257, "grad_norm": 0.12615694715416223, "learning_rate": 7.66374845018971e-06, "loss": 0.0358, "num_tokens": 66152912.0, "step": 868 }, { "epoch": 1.083593262632564, "grad_norm": 0.12041922594151447, "learning_rate": 7.658433313661372e-06, "loss": 0.0393, "num_tokens": 66228873.0, "step": 869 }, { "epoch": 1.084840923268871, "grad_norm": 0.12121834014527919, "learning_rate": 7.653114263241034e-06, "loss": 0.0381, "num_tokens": 66305201.0, "step": 870 }, { "epoch": 1.0860885839051777, "grad_norm": 0.13007390524350204, "learning_rate": 7.647791308573744e-06, "loss": 0.0394, "num_tokens": 66381316.0, "step": 871 }, { "epoch": 1.0873362445414847, "grad_norm": 0.12809810180480036, "learning_rate": 7.642464459311623e-06, "loss": 0.0396, "num_tokens": 66457844.0, "step": 872 }, { "epoch": 1.0885839051777917, "grad_norm": 0.12234620308401925, "learning_rate": 7.637133725113864e-06, "loss": 0.0386, "num_tokens": 66533263.0, "step": 873 }, { "epoch": 1.0898315658140985, "grad_norm": 0.11097029253021012, "learning_rate": 7.631799115646697e-06, "loss": 0.0372, "num_tokens": 66609039.0, "step": 874 }, { "epoch": 1.0910792264504054, "grad_norm": 0.1387492283712563, "learning_rate": 7.6264606405833805e-06, "loss": 0.0382, "num_tokens": 66684998.0, "step": 875 }, { "epoch": 1.0923268870867124, "grad_norm": 0.12281557435913792, "learning_rate": 7.621118309604186e-06, "loss": 0.0376, "num_tokens": 66760201.0, "step": 876 }, { "epoch": 1.0935745477230194, "grad_norm": 0.12597782436592558, "learning_rate": 7.615772132396373e-06, "loss": 0.0387, "num_tokens": 66837068.0, "step": 877 }, { "epoch": 1.0948222083593262, "grad_norm": 0.12304719888375368, "learning_rate": 7.6104221186541745e-06, "loss": 0.04, "num_tokens": 66912815.0, "step": 878 }, { "epoch": 1.0960698689956332, "grad_norm": 0.13032677225921888, "learning_rate": 7.6050682780787865e-06, "loss": 0.0432, "num_tokens": 66989780.0, "step": 879 }, { "epoch": 1.0973175296319402, "grad_norm": 0.1236460004101698, "learning_rate": 7.599710620378337e-06, "loss": 0.0405, "num_tokens": 67066176.0, "step": 880 }, { "epoch": 1.098565190268247, "grad_norm": 0.11979239248552905, "learning_rate": 7.594349155267879e-06, "loss": 0.0383, "num_tokens": 67142617.0, "step": 881 }, { "epoch": 1.099812850904554, "grad_norm": 0.12602615420228466, "learning_rate": 7.588983892469372e-06, "loss": 0.0383, "num_tokens": 67218428.0, "step": 882 }, { "epoch": 1.101060511540861, "grad_norm": 0.12720326733722198, "learning_rate": 7.583614841711657e-06, "loss": 0.0391, "num_tokens": 67293819.0, "step": 883 }, { "epoch": 1.102308172177168, "grad_norm": 0.13572936415520973, "learning_rate": 7.5782420127304466e-06, "loss": 0.0414, "num_tokens": 67370173.0, "step": 884 }, { "epoch": 1.1035558328134747, "grad_norm": 0.11987891385152093, "learning_rate": 7.572865415268303e-06, "loss": 0.0392, "num_tokens": 67446246.0, "step": 885 }, { "epoch": 1.1048034934497817, "grad_norm": 0.131690397693751, "learning_rate": 7.567485059074623e-06, "loss": 0.0387, "num_tokens": 67522135.0, "step": 886 }, { "epoch": 1.1060511540860887, "grad_norm": 0.15768309161114522, "learning_rate": 7.5621009539056175e-06, "loss": 0.0425, "num_tokens": 67599922.0, "step": 887 }, { "epoch": 1.1072988147223954, "grad_norm": 0.14441359221837843, "learning_rate": 7.556713109524301e-06, "loss": 0.0405, "num_tokens": 67677580.0, "step": 888 }, { "epoch": 1.1085464753587024, "grad_norm": 0.1292843222465948, "learning_rate": 7.551321535700456e-06, "loss": 0.0402, "num_tokens": 67754390.0, "step": 889 }, { "epoch": 1.1097941359950094, "grad_norm": 0.11922840726185915, "learning_rate": 7.545926242210643e-06, "loss": 0.0383, "num_tokens": 67830609.0, "step": 890 }, { "epoch": 1.1110417966313162, "grad_norm": 0.11666209441411698, "learning_rate": 7.540527238838156e-06, "loss": 0.0373, "num_tokens": 67905462.0, "step": 891 }, { "epoch": 1.1122894572676232, "grad_norm": 0.12935191115335293, "learning_rate": 7.535124535373019e-06, "loss": 0.0386, "num_tokens": 67982653.0, "step": 892 }, { "epoch": 1.1135371179039302, "grad_norm": 0.12276881001813526, "learning_rate": 7.529718141611972e-06, "loss": 0.0373, "num_tokens": 68057328.0, "step": 893 }, { "epoch": 1.114784778540237, "grad_norm": 0.13184667165067263, "learning_rate": 7.5243080673584345e-06, "loss": 0.0367, "num_tokens": 68132425.0, "step": 894 }, { "epoch": 1.116032439176544, "grad_norm": 0.11687424678949991, "learning_rate": 7.51889432242251e-06, "loss": 0.0387, "num_tokens": 68209624.0, "step": 895 }, { "epoch": 1.117280099812851, "grad_norm": 0.12821591356114023, "learning_rate": 7.513476916620952e-06, "loss": 0.0396, "num_tokens": 68286396.0, "step": 896 }, { "epoch": 1.118527760449158, "grad_norm": 0.13397330617441067, "learning_rate": 7.508055859777157e-06, "loss": 0.0375, "num_tokens": 68362383.0, "step": 897 }, { "epoch": 1.1197754210854647, "grad_norm": 0.11767653777640133, "learning_rate": 7.502631161721139e-06, "loss": 0.0369, "num_tokens": 68438133.0, "step": 898 }, { "epoch": 1.1210230817217717, "grad_norm": 0.12018970675793803, "learning_rate": 7.497202832289514e-06, "loss": 0.0393, "num_tokens": 68514149.0, "step": 899 }, { "epoch": 1.1222707423580787, "grad_norm": 0.124291033559986, "learning_rate": 7.4917708813254865e-06, "loss": 0.0389, "num_tokens": 68590799.0, "step": 900 }, { "epoch": 1.1235184029943854, "grad_norm": 0.11807610234269636, "learning_rate": 7.4863353186788234e-06, "loss": 0.0374, "num_tokens": 68666423.0, "step": 901 }, { "epoch": 1.1247660636306924, "grad_norm": 0.13303725093015237, "learning_rate": 7.480896154205844e-06, "loss": 0.038, "num_tokens": 68741602.0, "step": 902 }, { "epoch": 1.1260137242669994, "grad_norm": 0.11757569211552092, "learning_rate": 7.475453397769396e-06, "loss": 0.0399, "num_tokens": 68818640.0, "step": 903 }, { "epoch": 1.1272613849033064, "grad_norm": 0.13038770004184272, "learning_rate": 7.470007059238842e-06, "loss": 0.0379, "num_tokens": 68894885.0, "step": 904 }, { "epoch": 1.1285090455396132, "grad_norm": 0.13109249236061676, "learning_rate": 7.464557148490041e-06, "loss": 0.0396, "num_tokens": 68970453.0, "step": 905 }, { "epoch": 1.1297567061759202, "grad_norm": 0.12634016847293342, "learning_rate": 7.459103675405328e-06, "loss": 0.0372, "num_tokens": 69047236.0, "step": 906 }, { "epoch": 1.1310043668122272, "grad_norm": 0.12299085861262797, "learning_rate": 7.4536466498735e-06, "loss": 0.0397, "num_tokens": 69123903.0, "step": 907 }, { "epoch": 1.132252027448534, "grad_norm": 0.1319786544964183, "learning_rate": 7.44818608178979e-06, "loss": 0.0407, "num_tokens": 69200269.0, "step": 908 }, { "epoch": 1.133499688084841, "grad_norm": 0.13334686706378163, "learning_rate": 7.442721981055862e-06, "loss": 0.039, "num_tokens": 69275902.0, "step": 909 }, { "epoch": 1.134747348721148, "grad_norm": 0.12151092426485671, "learning_rate": 7.43725435757978e-06, "loss": 0.0377, "num_tokens": 69351293.0, "step": 910 }, { "epoch": 1.1359950093574547, "grad_norm": 0.11348410697636156, "learning_rate": 7.431783221275997e-06, "loss": 0.0375, "num_tokens": 69428894.0, "step": 911 }, { "epoch": 1.1372426699937617, "grad_norm": 0.11800100940339057, "learning_rate": 7.426308582065339e-06, "loss": 0.0377, "num_tokens": 69505235.0, "step": 912 }, { "epoch": 1.1384903306300687, "grad_norm": 0.13201001269167364, "learning_rate": 7.4208304498749825e-06, "loss": 0.0372, "num_tokens": 69582920.0, "step": 913 }, { "epoch": 1.1397379912663754, "grad_norm": 0.12114896597052012, "learning_rate": 7.415348834638433e-06, "loss": 0.0402, "num_tokens": 69659327.0, "step": 914 }, { "epoch": 1.1409856519026824, "grad_norm": 0.1333493333338715, "learning_rate": 7.40986374629552e-06, "loss": 0.0395, "num_tokens": 69735223.0, "step": 915 }, { "epoch": 1.1422333125389894, "grad_norm": 0.12347541882666985, "learning_rate": 7.404375194792365e-06, "loss": 0.0361, "num_tokens": 69810589.0, "step": 916 }, { "epoch": 1.1434809731752964, "grad_norm": 0.13077255784117334, "learning_rate": 7.398883190081368e-06, "loss": 0.0396, "num_tokens": 69886142.0, "step": 917 }, { "epoch": 1.1447286338116032, "grad_norm": 0.12866167128639208, "learning_rate": 7.3933877421211986e-06, "loss": 0.042, "num_tokens": 69962815.0, "step": 918 }, { "epoch": 1.1459762944479102, "grad_norm": 0.1426644408260711, "learning_rate": 7.387888860876763e-06, "loss": 0.0371, "num_tokens": 70038450.0, "step": 919 }, { "epoch": 1.1472239550842172, "grad_norm": 0.1191136878964363, "learning_rate": 7.382386556319193e-06, "loss": 0.0367, "num_tokens": 70113830.0, "step": 920 }, { "epoch": 1.1484716157205241, "grad_norm": 0.11766488295941847, "learning_rate": 7.376880838425832e-06, "loss": 0.0401, "num_tokens": 70190878.0, "step": 921 }, { "epoch": 1.149719276356831, "grad_norm": 0.1299870363233401, "learning_rate": 7.3713717171802106e-06, "loss": 0.0376, "num_tokens": 70267685.0, "step": 922 }, { "epoch": 1.150966936993138, "grad_norm": 0.1258019855180102, "learning_rate": 7.3658592025720285e-06, "loss": 0.0379, "num_tokens": 70346328.0, "step": 923 }, { "epoch": 1.152214597629445, "grad_norm": 0.12005546000064814, "learning_rate": 7.360343304597144e-06, "loss": 0.0405, "num_tokens": 70421909.0, "step": 924 }, { "epoch": 1.1534622582657517, "grad_norm": 0.13917434227685876, "learning_rate": 7.354824033257546e-06, "loss": 0.0388, "num_tokens": 70498763.0, "step": 925 }, { "epoch": 1.1547099189020587, "grad_norm": 0.1135457646599094, "learning_rate": 7.349301398561342e-06, "loss": 0.0374, "num_tokens": 70574235.0, "step": 926 }, { "epoch": 1.1559575795383656, "grad_norm": 0.12251659437468533, "learning_rate": 7.3437754105227365e-06, "loss": 0.0376, "num_tokens": 70650021.0, "step": 927 }, { "epoch": 1.1572052401746724, "grad_norm": 0.12140798784037611, "learning_rate": 7.3382460791620165e-06, "loss": 0.0399, "num_tokens": 70726089.0, "step": 928 }, { "epoch": 1.1584529008109794, "grad_norm": 0.13722375296866418, "learning_rate": 7.332713414505534e-06, "loss": 0.0408, "num_tokens": 70802890.0, "step": 929 }, { "epoch": 1.1597005614472864, "grad_norm": 0.12538243166812518, "learning_rate": 7.32717742658568e-06, "loss": 0.0371, "num_tokens": 70878378.0, "step": 930 }, { "epoch": 1.1609482220835932, "grad_norm": 0.11340208178725925, "learning_rate": 7.321638125440872e-06, "loss": 0.0361, "num_tokens": 70953286.0, "step": 931 }, { "epoch": 1.1621958827199002, "grad_norm": 0.12042441254724807, "learning_rate": 7.316095521115541e-06, "loss": 0.0396, "num_tokens": 71029347.0, "step": 932 }, { "epoch": 1.1634435433562071, "grad_norm": 0.12180711623275445, "learning_rate": 7.310549623660101e-06, "loss": 0.0394, "num_tokens": 71105407.0, "step": 933 }, { "epoch": 1.164691203992514, "grad_norm": 0.1342300366297676, "learning_rate": 7.305000443130943e-06, "loss": 0.0379, "num_tokens": 71181708.0, "step": 934 }, { "epoch": 1.165938864628821, "grad_norm": 0.12903385987396218, "learning_rate": 7.299447989590406e-06, "loss": 0.0419, "num_tokens": 71258943.0, "step": 935 }, { "epoch": 1.167186525265128, "grad_norm": 0.14085079025904748, "learning_rate": 7.293892273106768e-06, "loss": 0.0398, "num_tokens": 71334191.0, "step": 936 }, { "epoch": 1.1684341859014349, "grad_norm": 0.1321285243624536, "learning_rate": 7.2883333037542205e-06, "loss": 0.0374, "num_tokens": 71409277.0, "step": 937 }, { "epoch": 1.1696818465377417, "grad_norm": 0.11949825200704543, "learning_rate": 7.282771091612858e-06, "loss": 0.0374, "num_tokens": 71485087.0, "step": 938 }, { "epoch": 1.1709295071740486, "grad_norm": 0.13062252929041882, "learning_rate": 7.27720564676865e-06, "loss": 0.0401, "num_tokens": 71561286.0, "step": 939 }, { "epoch": 1.1721771678103556, "grad_norm": 0.1477865360798684, "learning_rate": 7.271636979313432e-06, "loss": 0.0398, "num_tokens": 71637031.0, "step": 940 }, { "epoch": 1.1734248284466626, "grad_norm": 0.13512349224041548, "learning_rate": 7.266065099344881e-06, "loss": 0.0404, "num_tokens": 71713245.0, "step": 941 }, { "epoch": 1.1746724890829694, "grad_norm": 0.14893654503163767, "learning_rate": 7.260490016966497e-06, "loss": 0.0389, "num_tokens": 71789690.0, "step": 942 }, { "epoch": 1.1759201497192764, "grad_norm": 0.12893022128126744, "learning_rate": 7.2549117422875925e-06, "loss": 0.0375, "num_tokens": 71864989.0, "step": 943 }, { "epoch": 1.1771678103555834, "grad_norm": 0.12549664193940147, "learning_rate": 7.249330285423265e-06, "loss": 0.04, "num_tokens": 71940192.0, "step": 944 }, { "epoch": 1.1784154709918901, "grad_norm": 0.12274636302470389, "learning_rate": 7.243745656494382e-06, "loss": 0.0379, "num_tokens": 72016076.0, "step": 945 }, { "epoch": 1.1796631316281971, "grad_norm": 0.11710254534160251, "learning_rate": 7.238157865627562e-06, "loss": 0.0365, "num_tokens": 72092219.0, "step": 946 }, { "epoch": 1.1809107922645041, "grad_norm": 0.12861913048299303, "learning_rate": 7.2325669229551636e-06, "loss": 0.0394, "num_tokens": 72169088.0, "step": 947 }, { "epoch": 1.182158452900811, "grad_norm": 0.11952840728072552, "learning_rate": 7.226972838615251e-06, "loss": 0.0375, "num_tokens": 72245176.0, "step": 948 }, { "epoch": 1.1834061135371179, "grad_norm": 0.13750612692860223, "learning_rate": 7.221375622751593e-06, "loss": 0.0396, "num_tokens": 72322210.0, "step": 949 }, { "epoch": 1.1846537741734249, "grad_norm": 0.1352346077782367, "learning_rate": 7.215775285513633e-06, "loss": 0.0384, "num_tokens": 72399608.0, "step": 950 }, { "epoch": 1.1859014348097316, "grad_norm": 0.11887468445347979, "learning_rate": 7.210171837056474e-06, "loss": 0.0358, "num_tokens": 72476194.0, "step": 951 }, { "epoch": 1.1871490954460386, "grad_norm": 0.11588279896504751, "learning_rate": 7.2045652875408614e-06, "loss": 0.0377, "num_tokens": 72552070.0, "step": 952 }, { "epoch": 1.1883967560823456, "grad_norm": 0.12255586677955618, "learning_rate": 7.198955647133167e-06, "loss": 0.0374, "num_tokens": 72627986.0, "step": 953 }, { "epoch": 1.1896444167186526, "grad_norm": 0.12035533003159458, "learning_rate": 7.193342926005362e-06, "loss": 0.0363, "num_tokens": 72703693.0, "step": 954 }, { "epoch": 1.1908920773549594, "grad_norm": 0.1258968428683712, "learning_rate": 7.187727134335006e-06, "loss": 0.0388, "num_tokens": 72780942.0, "step": 955 }, { "epoch": 1.1921397379912664, "grad_norm": 0.12871987765476092, "learning_rate": 7.182108282305231e-06, "loss": 0.0396, "num_tokens": 72857465.0, "step": 956 }, { "epoch": 1.1933873986275734, "grad_norm": 0.13203954993001707, "learning_rate": 7.176486380104707e-06, "loss": 0.0406, "num_tokens": 72933547.0, "step": 957 }, { "epoch": 1.1946350592638801, "grad_norm": 0.12670518611183942, "learning_rate": 7.1708614379276485e-06, "loss": 0.0374, "num_tokens": 73009045.0, "step": 958 }, { "epoch": 1.1958827199001871, "grad_norm": 0.13557051840818, "learning_rate": 7.165233465973771e-06, "loss": 0.0384, "num_tokens": 73084864.0, "step": 959 }, { "epoch": 1.1971303805364941, "grad_norm": 0.12840733753866795, "learning_rate": 7.159602474448292e-06, "loss": 0.0387, "num_tokens": 73160525.0, "step": 960 }, { "epoch": 1.1983780411728011, "grad_norm": 0.11239106491031488, "learning_rate": 7.1539684735618995e-06, "loss": 0.0349, "num_tokens": 73235850.0, "step": 961 }, { "epoch": 1.1996257018091079, "grad_norm": 0.12567537645801974, "learning_rate": 7.148331473530741e-06, "loss": 0.0386, "num_tokens": 73310704.0, "step": 962 }, { "epoch": 1.2008733624454149, "grad_norm": 0.12585028625143363, "learning_rate": 7.142691484576399e-06, "loss": 0.0377, "num_tokens": 73386704.0, "step": 963 }, { "epoch": 1.2021210230817219, "grad_norm": 0.12328153422534764, "learning_rate": 7.137048516925882e-06, "loss": 0.0404, "num_tokens": 73463619.0, "step": 964 }, { "epoch": 1.2033686837180286, "grad_norm": 0.13021530210661525, "learning_rate": 7.131402580811593e-06, "loss": 0.0381, "num_tokens": 73538731.0, "step": 965 }, { "epoch": 1.2046163443543356, "grad_norm": 0.12202075613733623, "learning_rate": 7.125753686471322e-06, "loss": 0.0385, "num_tokens": 73615795.0, "step": 966 }, { "epoch": 1.2058640049906426, "grad_norm": 0.12961790504212412, "learning_rate": 7.120101844148222e-06, "loss": 0.039, "num_tokens": 73692228.0, "step": 967 }, { "epoch": 1.2071116656269494, "grad_norm": 0.13052470730079757, "learning_rate": 7.1144470640907906e-06, "loss": 0.0389, "num_tokens": 73768783.0, "step": 968 }, { "epoch": 1.2083593262632564, "grad_norm": 0.134664952991497, "learning_rate": 7.1087893565528545e-06, "loss": 0.038, "num_tokens": 73844511.0, "step": 969 }, { "epoch": 1.2096069868995634, "grad_norm": 0.11415812603318583, "learning_rate": 7.103128731793546e-06, "loss": 0.0378, "num_tokens": 73920806.0, "step": 970 }, { "epoch": 1.2108546475358701, "grad_norm": 0.12776307077120927, "learning_rate": 7.097465200077289e-06, "loss": 0.0385, "num_tokens": 73996188.0, "step": 971 }, { "epoch": 1.2121023081721771, "grad_norm": 0.13377749841884895, "learning_rate": 7.0917987716737795e-06, "loss": 0.0388, "num_tokens": 74073219.0, "step": 972 }, { "epoch": 1.2133499688084841, "grad_norm": 0.13091768787237276, "learning_rate": 7.086129456857963e-06, "loss": 0.0373, "num_tokens": 74149596.0, "step": 973 }, { "epoch": 1.214597629444791, "grad_norm": 0.12906183805216612, "learning_rate": 7.080457265910022e-06, "loss": 0.0408, "num_tokens": 74225555.0, "step": 974 }, { "epoch": 1.2158452900810979, "grad_norm": 0.1264473460077226, "learning_rate": 7.074782209115356e-06, "loss": 0.0377, "num_tokens": 74302435.0, "step": 975 }, { "epoch": 1.2170929507174049, "grad_norm": 0.12036445295841532, "learning_rate": 7.069104296764553e-06, "loss": 0.0387, "num_tokens": 74379211.0, "step": 976 }, { "epoch": 1.2183406113537119, "grad_norm": 0.1365334912475307, "learning_rate": 7.0634235391533874e-06, "loss": 0.0356, "num_tokens": 74454460.0, "step": 977 }, { "epoch": 1.2195882719900186, "grad_norm": 0.11930134399444615, "learning_rate": 7.05773994658279e-06, "loss": 0.0394, "num_tokens": 74531587.0, "step": 978 }, { "epoch": 1.2208359326263256, "grad_norm": 0.12394477513731392, "learning_rate": 7.052053529358831e-06, "loss": 0.0382, "num_tokens": 74607528.0, "step": 979 }, { "epoch": 1.2220835932626326, "grad_norm": 0.12312553920411257, "learning_rate": 7.046364297792703e-06, "loss": 0.0363, "num_tokens": 74683276.0, "step": 980 }, { "epoch": 1.2233312538989396, "grad_norm": 0.13785776039233755, "learning_rate": 7.040672262200705e-06, "loss": 0.0385, "num_tokens": 74759411.0, "step": 981 }, { "epoch": 1.2245789145352464, "grad_norm": 0.11472386239465589, "learning_rate": 7.0349774329042135e-06, "loss": 0.0368, "num_tokens": 74835216.0, "step": 982 }, { "epoch": 1.2258265751715534, "grad_norm": 0.12173051063783977, "learning_rate": 7.02927982022968e-06, "loss": 0.0396, "num_tokens": 74911974.0, "step": 983 }, { "epoch": 1.2270742358078603, "grad_norm": 0.12457778411901807, "learning_rate": 7.023579434508596e-06, "loss": 0.041, "num_tokens": 74988340.0, "step": 984 }, { "epoch": 1.2283218964441671, "grad_norm": 0.12731680919814084, "learning_rate": 7.017876286077484e-06, "loss": 0.0391, "num_tokens": 75064637.0, "step": 985 }, { "epoch": 1.229569557080474, "grad_norm": 0.12533235166622664, "learning_rate": 7.012170385277877e-06, "loss": 0.039, "num_tokens": 75140639.0, "step": 986 }, { "epoch": 1.230817217716781, "grad_norm": 0.11252297683062269, "learning_rate": 7.006461742456297e-06, "loss": 0.0377, "num_tokens": 75216537.0, "step": 987 }, { "epoch": 1.2320648783530879, "grad_norm": 0.11919903473585805, "learning_rate": 7.000750367964239e-06, "loss": 0.0403, "num_tokens": 75293221.0, "step": 988 }, { "epoch": 1.2333125389893949, "grad_norm": 0.11769788430742996, "learning_rate": 6.99503627215815e-06, "loss": 0.0372, "num_tokens": 75368699.0, "step": 989 }, { "epoch": 1.2345601996257018, "grad_norm": 0.1279368792464733, "learning_rate": 6.989319465399415e-06, "loss": 0.0387, "num_tokens": 75444397.0, "step": 990 }, { "epoch": 1.2358078602620086, "grad_norm": 0.12159070141319064, "learning_rate": 6.983599958054331e-06, "loss": 0.0376, "num_tokens": 75519906.0, "step": 991 }, { "epoch": 1.2370555208983156, "grad_norm": 0.12029451059502534, "learning_rate": 6.977877760494094e-06, "loss": 0.04, "num_tokens": 75595546.0, "step": 992 }, { "epoch": 1.2383031815346226, "grad_norm": 0.13069178124066883, "learning_rate": 6.972152883094778e-06, "loss": 0.0378, "num_tokens": 75672691.0, "step": 993 }, { "epoch": 1.2395508421709296, "grad_norm": 0.12074943091247309, "learning_rate": 6.966425336237317e-06, "loss": 0.0371, "num_tokens": 75748316.0, "step": 994 }, { "epoch": 1.2407985028072364, "grad_norm": 0.1289634381789005, "learning_rate": 6.960695130307484e-06, "loss": 0.041, "num_tokens": 75824693.0, "step": 995 }, { "epoch": 1.2420461634435433, "grad_norm": 0.1256483249863802, "learning_rate": 6.954962275695871e-06, "loss": 0.04, "num_tokens": 75901978.0, "step": 996 }, { "epoch": 1.2432938240798503, "grad_norm": 0.12999985473947567, "learning_rate": 6.9492267827978824e-06, "loss": 0.0375, "num_tokens": 75978741.0, "step": 997 }, { "epoch": 1.244541484716157, "grad_norm": 0.11847867937232391, "learning_rate": 6.943488662013697e-06, "loss": 0.0375, "num_tokens": 76054336.0, "step": 998 }, { "epoch": 1.245789145352464, "grad_norm": 0.12241451988551497, "learning_rate": 6.93774792374826e-06, "loss": 0.0388, "num_tokens": 76131145.0, "step": 999 }, { "epoch": 1.247036805988771, "grad_norm": 0.13576458392659807, "learning_rate": 6.93200457841127e-06, "loss": 0.0371, "num_tokens": 76207946.0, "step": 1000 }, { "epoch": 1.248284466625078, "grad_norm": 0.1357083738319804, "learning_rate": 6.9262586364171455e-06, "loss": 0.0407, "num_tokens": 76284678.0, "step": 1001 }, { "epoch": 1.2495321272613849, "grad_norm": 0.1417184857589529, "learning_rate": 6.920510108185016e-06, "loss": 0.0369, "num_tokens": 76361020.0, "step": 1002 }, { "epoch": 1.2507797878976918, "grad_norm": 0.12259900282643452, "learning_rate": 6.9147590041387e-06, "loss": 0.0371, "num_tokens": 76437197.0, "step": 1003 }, { "epoch": 1.2520274485339988, "grad_norm": 0.11833456048304121, "learning_rate": 6.909005334706688e-06, "loss": 0.0387, "num_tokens": 76513600.0, "step": 1004 }, { "epoch": 1.2532751091703056, "grad_norm": 0.1362755912509239, "learning_rate": 6.903249110322123e-06, "loss": 0.0461, "num_tokens": 76590862.0, "step": 1005 }, { "epoch": 1.2545227698066126, "grad_norm": 0.12371120963572597, "learning_rate": 6.897490341422779e-06, "loss": 0.036, "num_tokens": 76667344.0, "step": 1006 }, { "epoch": 1.2557704304429196, "grad_norm": 0.12643531689407858, "learning_rate": 6.8917290384510435e-06, "loss": 0.0391, "num_tokens": 76743875.0, "step": 1007 }, { "epoch": 1.2570180910792264, "grad_norm": 0.13473742006949435, "learning_rate": 6.885965211853902e-06, "loss": 0.0459, "num_tokens": 76819364.0, "step": 1008 }, { "epoch": 1.2582657517155333, "grad_norm": 0.13094378837230844, "learning_rate": 6.8801988720829134e-06, "loss": 0.0377, "num_tokens": 76894778.0, "step": 1009 }, { "epoch": 1.2595134123518403, "grad_norm": 0.13013560077380523, "learning_rate": 6.874430029594194e-06, "loss": 0.0382, "num_tokens": 76971000.0, "step": 1010 }, { "epoch": 1.260761072988147, "grad_norm": 0.11496471028200521, "learning_rate": 6.8686586948483995e-06, "loss": 0.0378, "num_tokens": 77046773.0, "step": 1011 }, { "epoch": 1.262008733624454, "grad_norm": 0.12396329610612375, "learning_rate": 6.862884878310705e-06, "loss": 0.0381, "num_tokens": 77122750.0, "step": 1012 }, { "epoch": 1.263256394260761, "grad_norm": 0.12570886085428049, "learning_rate": 6.8571085904507825e-06, "loss": 0.0368, "num_tokens": 77198494.0, "step": 1013 }, { "epoch": 1.264504054897068, "grad_norm": 0.11028117409017361, "learning_rate": 6.8513298417427895e-06, "loss": 0.0389, "num_tokens": 77273307.0, "step": 1014 }, { "epoch": 1.2657517155333748, "grad_norm": 0.1471893491163991, "learning_rate": 6.845548642665347e-06, "loss": 0.0388, "num_tokens": 77349388.0, "step": 1015 }, { "epoch": 1.2669993761696818, "grad_norm": 0.11262904086099527, "learning_rate": 6.839765003701511e-06, "loss": 0.0379, "num_tokens": 77426148.0, "step": 1016 }, { "epoch": 1.2682470368059888, "grad_norm": 0.13361856137063885, "learning_rate": 6.833978935338772e-06, "loss": 0.0379, "num_tokens": 77501759.0, "step": 1017 }, { "epoch": 1.2694946974422958, "grad_norm": 0.12279484859518144, "learning_rate": 6.828190448069016e-06, "loss": 0.0406, "num_tokens": 77579480.0, "step": 1018 }, { "epoch": 1.2707423580786026, "grad_norm": 0.12242840320310547, "learning_rate": 6.822399552388523e-06, "loss": 0.0359, "num_tokens": 77655675.0, "step": 1019 }, { "epoch": 1.2719900187149096, "grad_norm": 0.116893287227689, "learning_rate": 6.816606258797936e-06, "loss": 0.0381, "num_tokens": 77733976.0, "step": 1020 }, { "epoch": 1.2732376793512166, "grad_norm": 0.1222409289259211, "learning_rate": 6.810810577802249e-06, "loss": 0.0378, "num_tokens": 77809768.0, "step": 1021 }, { "epoch": 1.2744853399875233, "grad_norm": 0.12443386098402114, "learning_rate": 6.8050125199107835e-06, "loss": 0.0383, "num_tokens": 77886283.0, "step": 1022 }, { "epoch": 1.2757330006238303, "grad_norm": 0.12792841056094306, "learning_rate": 6.799212095637169e-06, "loss": 0.038, "num_tokens": 77962705.0, "step": 1023 }, { "epoch": 1.2769806612601373, "grad_norm": 0.12853700012331887, "learning_rate": 6.7934093154993285e-06, "loss": 0.0381, "num_tokens": 78038286.0, "step": 1024 }, { "epoch": 1.278228321896444, "grad_norm": 0.1204293670396625, "learning_rate": 6.787604190019456e-06, "loss": 0.0379, "num_tokens": 78113897.0, "step": 1025 }, { "epoch": 1.279475982532751, "grad_norm": 0.16278447289558357, "learning_rate": 6.781796729724001e-06, "loss": 0.039, "num_tokens": 78190823.0, "step": 1026 }, { "epoch": 1.280723643169058, "grad_norm": 0.14049403625476514, "learning_rate": 6.775986945143641e-06, "loss": 0.0365, "num_tokens": 78266929.0, "step": 1027 }, { "epoch": 1.2819713038053648, "grad_norm": 0.12034977499671139, "learning_rate": 6.770174846813273e-06, "loss": 0.0361, "num_tokens": 78343744.0, "step": 1028 }, { "epoch": 1.2832189644416718, "grad_norm": 0.11781958413703834, "learning_rate": 6.7643604452719894e-06, "loss": 0.0359, "num_tokens": 78418807.0, "step": 1029 }, { "epoch": 1.2844666250779788, "grad_norm": 0.12121542127811726, "learning_rate": 6.758543751063055e-06, "loss": 0.0382, "num_tokens": 78496287.0, "step": 1030 }, { "epoch": 1.2857142857142856, "grad_norm": 0.11962328271317485, "learning_rate": 6.752724774733899e-06, "loss": 0.0395, "num_tokens": 78571959.0, "step": 1031 }, { "epoch": 1.2869619463505926, "grad_norm": 0.11711191648751425, "learning_rate": 6.746903526836079e-06, "loss": 0.0368, "num_tokens": 78647917.0, "step": 1032 }, { "epoch": 1.2882096069868996, "grad_norm": 0.11174765287606012, "learning_rate": 6.741080017925279e-06, "loss": 0.038, "num_tokens": 78723022.0, "step": 1033 }, { "epoch": 1.2894572676232066, "grad_norm": 0.14213629513727627, "learning_rate": 6.735254258561281e-06, "loss": 0.041, "num_tokens": 78799598.0, "step": 1034 }, { "epoch": 1.2907049282595136, "grad_norm": 0.11404088303991229, "learning_rate": 6.729426259307948e-06, "loss": 0.035, "num_tokens": 78875225.0, "step": 1035 }, { "epoch": 1.2919525888958203, "grad_norm": 0.1259336273419687, "learning_rate": 6.723596030733204e-06, "loss": 0.0408, "num_tokens": 78953031.0, "step": 1036 }, { "epoch": 1.2932002495321273, "grad_norm": 0.15791825227165843, "learning_rate": 6.717763583409016e-06, "loss": 0.0614, "num_tokens": 79031801.0, "step": 1037 }, { "epoch": 1.2944479101684343, "grad_norm": 0.1304087409910658, "learning_rate": 6.711928927911373e-06, "loss": 0.0387, "num_tokens": 79107509.0, "step": 1038 }, { "epoch": 1.295695570804741, "grad_norm": 0.12600021318695712, "learning_rate": 6.7060920748202674e-06, "loss": 0.0346, "num_tokens": 79184034.0, "step": 1039 }, { "epoch": 1.296943231441048, "grad_norm": 0.12435271412670368, "learning_rate": 6.700253034719684e-06, "loss": 0.0381, "num_tokens": 79263280.0, "step": 1040 }, { "epoch": 1.298190892077355, "grad_norm": 0.11823691658700941, "learning_rate": 6.694411818197561e-06, "loss": 0.0418, "num_tokens": 79339787.0, "step": 1041 }, { "epoch": 1.2994385527136618, "grad_norm": 0.13713072747764696, "learning_rate": 6.688568435845792e-06, "loss": 0.0386, "num_tokens": 79414932.0, "step": 1042 }, { "epoch": 1.3006862133499688, "grad_norm": 0.12634187217101714, "learning_rate": 6.682722898260195e-06, "loss": 0.0377, "num_tokens": 79491173.0, "step": 1043 }, { "epoch": 1.3019338739862758, "grad_norm": 0.12086406055618043, "learning_rate": 6.676875216040498e-06, "loss": 0.0373, "num_tokens": 79567357.0, "step": 1044 }, { "epoch": 1.3031815346225826, "grad_norm": 0.11748638779428121, "learning_rate": 6.671025399790315e-06, "loss": 0.0349, "num_tokens": 79643546.0, "step": 1045 }, { "epoch": 1.3044291952588896, "grad_norm": 0.12900997225280608, "learning_rate": 6.66517346011713e-06, "loss": 0.0374, "num_tokens": 79719107.0, "step": 1046 }, { "epoch": 1.3056768558951966, "grad_norm": 0.1132473938030605, "learning_rate": 6.659319407632282e-06, "loss": 0.0391, "num_tokens": 79795324.0, "step": 1047 }, { "epoch": 1.3069245165315033, "grad_norm": 0.12056242211992067, "learning_rate": 6.653463252950933e-06, "loss": 0.0381, "num_tokens": 79872580.0, "step": 1048 }, { "epoch": 1.3081721771678103, "grad_norm": 0.12298594861515041, "learning_rate": 6.647605006692066e-06, "loss": 0.0372, "num_tokens": 79948484.0, "step": 1049 }, { "epoch": 1.3094198378041173, "grad_norm": 0.1224135552748089, "learning_rate": 6.641744679478448e-06, "loss": 0.0385, "num_tokens": 80024409.0, "step": 1050 }, { "epoch": 1.310667498440424, "grad_norm": 0.1238426241135653, "learning_rate": 6.635882281936625e-06, "loss": 0.0369, "num_tokens": 80100285.0, "step": 1051 }, { "epoch": 1.311915159076731, "grad_norm": 0.12678498517610146, "learning_rate": 6.630017824696898e-06, "loss": 0.0374, "num_tokens": 80175891.0, "step": 1052 }, { "epoch": 1.313162819713038, "grad_norm": 0.11416544137866638, "learning_rate": 6.624151318393298e-06, "loss": 0.0364, "num_tokens": 80250985.0, "step": 1053 }, { "epoch": 1.314410480349345, "grad_norm": 0.1186845318691531, "learning_rate": 6.618282773663576e-06, "loss": 0.0369, "num_tokens": 80326912.0, "step": 1054 }, { "epoch": 1.315658140985652, "grad_norm": 0.11768290749010996, "learning_rate": 6.612412201149175e-06, "loss": 0.0356, "num_tokens": 80402020.0, "step": 1055 }, { "epoch": 1.3169058016219588, "grad_norm": 0.11763439527023663, "learning_rate": 6.6065396114952195e-06, "loss": 0.0358, "num_tokens": 80477106.0, "step": 1056 }, { "epoch": 1.3181534622582658, "grad_norm": 0.10917918873769969, "learning_rate": 6.600665015350487e-06, "loss": 0.0362, "num_tokens": 80552797.0, "step": 1057 }, { "epoch": 1.3194011228945728, "grad_norm": 0.1297261226982308, "learning_rate": 6.594788423367399e-06, "loss": 0.0379, "num_tokens": 80630455.0, "step": 1058 }, { "epoch": 1.3206487835308796, "grad_norm": 0.1357107428559246, "learning_rate": 6.588909846201992e-06, "loss": 0.0359, "num_tokens": 80706916.0, "step": 1059 }, { "epoch": 1.3218964441671865, "grad_norm": 0.11167341542497014, "learning_rate": 6.583029294513902e-06, "loss": 0.0382, "num_tokens": 80783306.0, "step": 1060 }, { "epoch": 1.3231441048034935, "grad_norm": 0.126267003510051, "learning_rate": 6.577146778966347e-06, "loss": 0.0368, "num_tokens": 80858263.0, "step": 1061 }, { "epoch": 1.3243917654398003, "grad_norm": 0.11755835211629956, "learning_rate": 6.571262310226108e-06, "loss": 0.0363, "num_tokens": 80934605.0, "step": 1062 }, { "epoch": 1.3256394260761073, "grad_norm": 0.12868435000077522, "learning_rate": 6.565375898963503e-06, "loss": 0.0389, "num_tokens": 81010777.0, "step": 1063 }, { "epoch": 1.3268870867124143, "grad_norm": 0.13643865231728688, "learning_rate": 6.5594875558523755e-06, "loss": 0.0375, "num_tokens": 81086979.0, "step": 1064 }, { "epoch": 1.328134747348721, "grad_norm": 0.11784928737975484, "learning_rate": 6.553597291570071e-06, "loss": 0.0383, "num_tokens": 81162843.0, "step": 1065 }, { "epoch": 1.329382407985028, "grad_norm": 0.14182537453632305, "learning_rate": 6.547705116797422e-06, "loss": 0.0402, "num_tokens": 81239935.0, "step": 1066 }, { "epoch": 1.330630068621335, "grad_norm": 0.12586204808908688, "learning_rate": 6.5418110422187156e-06, "loss": 0.0383, "num_tokens": 81316699.0, "step": 1067 }, { "epoch": 1.3318777292576418, "grad_norm": 0.12092515001770596, "learning_rate": 6.535915078521697e-06, "loss": 0.0405, "num_tokens": 81392842.0, "step": 1068 }, { "epoch": 1.3331253898939488, "grad_norm": 0.12527079685312903, "learning_rate": 6.530017236397529e-06, "loss": 0.0385, "num_tokens": 81468914.0, "step": 1069 }, { "epoch": 1.3343730505302558, "grad_norm": 0.12405547111378336, "learning_rate": 6.52411752654078e-06, "loss": 0.0403, "num_tokens": 81546381.0, "step": 1070 }, { "epoch": 1.3356207111665626, "grad_norm": 0.12043411317614498, "learning_rate": 6.518215959649409e-06, "loss": 0.036, "num_tokens": 81621864.0, "step": 1071 }, { "epoch": 1.3368683718028695, "grad_norm": 0.11269431565422956, "learning_rate": 6.512312546424739e-06, "loss": 0.0367, "num_tokens": 81697399.0, "step": 1072 }, { "epoch": 1.3381160324391765, "grad_norm": 0.131008457319013, "learning_rate": 6.506407297571445e-06, "loss": 0.0387, "num_tokens": 81773914.0, "step": 1073 }, { "epoch": 1.3393636930754835, "grad_norm": 0.12914008887166922, "learning_rate": 6.500500223797526e-06, "loss": 0.0394, "num_tokens": 81851041.0, "step": 1074 }, { "epoch": 1.3406113537117905, "grad_norm": 0.11969733474999027, "learning_rate": 6.494591335814292e-06, "loss": 0.0373, "num_tokens": 81926575.0, "step": 1075 }, { "epoch": 1.3418590143480973, "grad_norm": 0.1256605085911434, "learning_rate": 6.488680644336344e-06, "loss": 0.0356, "num_tokens": 82001949.0, "step": 1076 }, { "epoch": 1.3431066749844043, "grad_norm": 0.12072878078758512, "learning_rate": 6.482768160081553e-06, "loss": 0.0372, "num_tokens": 82082965.0, "step": 1077 }, { "epoch": 1.3443543356207113, "grad_norm": 0.11564706297590938, "learning_rate": 6.4768538937710364e-06, "loss": 0.0364, "num_tokens": 82159572.0, "step": 1078 }, { "epoch": 1.345601996257018, "grad_norm": 0.11810783963495697, "learning_rate": 6.470937856129152e-06, "loss": 0.0362, "num_tokens": 82236392.0, "step": 1079 }, { "epoch": 1.346849656893325, "grad_norm": 0.12243552939696348, "learning_rate": 6.465020057883461e-06, "loss": 0.0402, "num_tokens": 82312762.0, "step": 1080 }, { "epoch": 1.348097317529632, "grad_norm": 0.12176519886783935, "learning_rate": 6.45910050976472e-06, "loss": 0.0356, "num_tokens": 82388841.0, "step": 1081 }, { "epoch": 1.3493449781659388, "grad_norm": 0.10978489412644651, "learning_rate": 6.45317922250686e-06, "loss": 0.0356, "num_tokens": 82464488.0, "step": 1082 }, { "epoch": 1.3505926388022458, "grad_norm": 0.11917674192716878, "learning_rate": 6.447256206846963e-06, "loss": 0.0385, "num_tokens": 82541359.0, "step": 1083 }, { "epoch": 1.3518402994385528, "grad_norm": 0.12641710504405018, "learning_rate": 6.44133147352525e-06, "loss": 0.0389, "num_tokens": 82620545.0, "step": 1084 }, { "epoch": 1.3530879600748595, "grad_norm": 0.1235769738581032, "learning_rate": 6.4354050332850505e-06, "loss": 0.0365, "num_tokens": 82696305.0, "step": 1085 }, { "epoch": 1.3543356207111665, "grad_norm": 0.11868272459465616, "learning_rate": 6.429476896872793e-06, "loss": 0.0366, "num_tokens": 82772823.0, "step": 1086 }, { "epoch": 1.3555832813474735, "grad_norm": 0.11716548574142212, "learning_rate": 6.4235470750379794e-06, "loss": 0.0378, "num_tokens": 82848581.0, "step": 1087 }, { "epoch": 1.3568309419837803, "grad_norm": 0.12977983514242444, "learning_rate": 6.4176155785331705e-06, "loss": 0.0373, "num_tokens": 82922803.0, "step": 1088 }, { "epoch": 1.3580786026200873, "grad_norm": 0.11983262533415129, "learning_rate": 6.411682418113961e-06, "loss": 0.0369, "num_tokens": 82999543.0, "step": 1089 }, { "epoch": 1.3593262632563943, "grad_norm": 0.10881217167861341, "learning_rate": 6.405747604538965e-06, "loss": 0.0391, "num_tokens": 83075716.0, "step": 1090 }, { "epoch": 1.3605739238927013, "grad_norm": 0.11312143319312479, "learning_rate": 6.399811148569794e-06, "loss": 0.037, "num_tokens": 83152235.0, "step": 1091 }, { "epoch": 1.361821584529008, "grad_norm": 0.12868490609288766, "learning_rate": 6.393873060971036e-06, "loss": 0.0382, "num_tokens": 83227490.0, "step": 1092 }, { "epoch": 1.363069245165315, "grad_norm": 0.11394818001578064, "learning_rate": 6.3879333525102375e-06, "loss": 0.0364, "num_tokens": 83304281.0, "step": 1093 }, { "epoch": 1.364316905801622, "grad_norm": 0.13139308054317986, "learning_rate": 6.381992033957889e-06, "loss": 0.0369, "num_tokens": 83381463.0, "step": 1094 }, { "epoch": 1.365564566437929, "grad_norm": 0.12142619982158942, "learning_rate": 6.376049116087393e-06, "loss": 0.0412, "num_tokens": 83458122.0, "step": 1095 }, { "epoch": 1.3668122270742358, "grad_norm": 0.12819557314527608, "learning_rate": 6.370104609675058e-06, "loss": 0.0375, "num_tokens": 83534927.0, "step": 1096 }, { "epoch": 1.3680598877105428, "grad_norm": 0.13688859870267972, "learning_rate": 6.364158525500069e-06, "loss": 0.0377, "num_tokens": 83611556.0, "step": 1097 }, { "epoch": 1.3693075483468498, "grad_norm": 0.1139664206178106, "learning_rate": 6.358210874344476e-06, "loss": 0.0377, "num_tokens": 83688500.0, "step": 1098 }, { "epoch": 1.3705552089831565, "grad_norm": 0.12878963288602277, "learning_rate": 6.352261666993167e-06, "loss": 0.0398, "num_tokens": 83764887.0, "step": 1099 }, { "epoch": 1.3718028696194635, "grad_norm": 0.13187044333532477, "learning_rate": 6.346310914233854e-06, "loss": 0.0402, "num_tokens": 83840573.0, "step": 1100 }, { "epoch": 1.3730505302557705, "grad_norm": 0.13218470782925437, "learning_rate": 6.340358626857049e-06, "loss": 0.0369, "num_tokens": 83915835.0, "step": 1101 }, { "epoch": 1.3742981908920773, "grad_norm": 0.123600988524384, "learning_rate": 6.334404815656049e-06, "loss": 0.037, "num_tokens": 83993107.0, "step": 1102 }, { "epoch": 1.3755458515283843, "grad_norm": 0.1141887638770041, "learning_rate": 6.328449491426914e-06, "loss": 0.0364, "num_tokens": 84069739.0, "step": 1103 }, { "epoch": 1.3767935121646913, "grad_norm": 0.11853940734782815, "learning_rate": 6.322492664968446e-06, "loss": 0.0376, "num_tokens": 84145150.0, "step": 1104 }, { "epoch": 1.378041172800998, "grad_norm": 0.12448334472309965, "learning_rate": 6.316534347082173e-06, "loss": 0.0369, "num_tokens": 84221521.0, "step": 1105 }, { "epoch": 1.379288833437305, "grad_norm": 0.11289826691946864, "learning_rate": 6.310574548572325e-06, "loss": 0.0339, "num_tokens": 84296673.0, "step": 1106 }, { "epoch": 1.380536494073612, "grad_norm": 0.11584642156593683, "learning_rate": 6.304613280245816e-06, "loss": 0.0368, "num_tokens": 84373020.0, "step": 1107 }, { "epoch": 1.3817841547099188, "grad_norm": 0.11898007042409253, "learning_rate": 6.298650552912233e-06, "loss": 0.0383, "num_tokens": 84449012.0, "step": 1108 }, { "epoch": 1.3830318153462258, "grad_norm": 0.13178915624086404, "learning_rate": 6.292686377383797e-06, "loss": 0.0379, "num_tokens": 84525080.0, "step": 1109 }, { "epoch": 1.3842794759825328, "grad_norm": 0.12485077614334157, "learning_rate": 6.286720764475365e-06, "loss": 0.039, "num_tokens": 84602186.0, "step": 1110 }, { "epoch": 1.3855271366188397, "grad_norm": 0.12594599877140716, "learning_rate": 6.280753725004395e-06, "loss": 0.04, "num_tokens": 84677075.0, "step": 1111 }, { "epoch": 1.3867747972551465, "grad_norm": 0.12855159432742083, "learning_rate": 6.274785269790932e-06, "loss": 0.0368, "num_tokens": 84752554.0, "step": 1112 }, { "epoch": 1.3880224578914535, "grad_norm": 0.11427665618225598, "learning_rate": 6.268815409657592e-06, "loss": 0.0373, "num_tokens": 84828980.0, "step": 1113 }, { "epoch": 1.3892701185277605, "grad_norm": 0.11937242460768584, "learning_rate": 6.262844155429533e-06, "loss": 0.0373, "num_tokens": 84905393.0, "step": 1114 }, { "epoch": 1.3905177791640675, "grad_norm": 0.12860043075057506, "learning_rate": 6.256871517934445e-06, "loss": 0.0374, "num_tokens": 84982169.0, "step": 1115 }, { "epoch": 1.3917654398003743, "grad_norm": 0.12212869867413878, "learning_rate": 6.2508975080025254e-06, "loss": 0.0388, "num_tokens": 85059133.0, "step": 1116 }, { "epoch": 1.3930131004366813, "grad_norm": 0.11678301135662582, "learning_rate": 6.24492213646646e-06, "loss": 0.0392, "num_tokens": 85135719.0, "step": 1117 }, { "epoch": 1.3942607610729882, "grad_norm": 0.12535257252564777, "learning_rate": 6.2389454141614024e-06, "loss": 0.0381, "num_tokens": 85211509.0, "step": 1118 }, { "epoch": 1.395508421709295, "grad_norm": 0.13291682129274268, "learning_rate": 6.232967351924959e-06, "loss": 0.0414, "num_tokens": 85287979.0, "step": 1119 }, { "epoch": 1.396756082345602, "grad_norm": 0.12295637500049969, "learning_rate": 6.226987960597161e-06, "loss": 0.0366, "num_tokens": 85364645.0, "step": 1120 }, { "epoch": 1.398003742981909, "grad_norm": 0.11736989037967782, "learning_rate": 6.22100725102045e-06, "loss": 0.0365, "num_tokens": 85440252.0, "step": 1121 }, { "epoch": 1.3992514036182158, "grad_norm": 0.11734689556837297, "learning_rate": 6.215025234039667e-06, "loss": 0.0381, "num_tokens": 85517224.0, "step": 1122 }, { "epoch": 1.4004990642545228, "grad_norm": 0.12117102979297299, "learning_rate": 6.209041920502012e-06, "loss": 0.041, "num_tokens": 85593781.0, "step": 1123 }, { "epoch": 1.4017467248908297, "grad_norm": 0.12125413473547068, "learning_rate": 6.203057321257041e-06, "loss": 0.0394, "num_tokens": 85669972.0, "step": 1124 }, { "epoch": 1.4029943855271365, "grad_norm": 0.12806121362449266, "learning_rate": 6.197071447156643e-06, "loss": 0.0429, "num_tokens": 85748190.0, "step": 1125 }, { "epoch": 1.4042420461634435, "grad_norm": 0.13610898293273987, "learning_rate": 6.191084309055018e-06, "loss": 0.0393, "num_tokens": 85824528.0, "step": 1126 }, { "epoch": 1.4054897067997505, "grad_norm": 0.1338585027234358, "learning_rate": 6.185095917808654e-06, "loss": 0.0385, "num_tokens": 85901267.0, "step": 1127 }, { "epoch": 1.4067373674360573, "grad_norm": 0.1224560732061581, "learning_rate": 6.179106284276315e-06, "loss": 0.0373, "num_tokens": 85976334.0, "step": 1128 }, { "epoch": 1.4079850280723643, "grad_norm": 0.12016867501292519, "learning_rate": 6.173115419319019e-06, "loss": 0.035, "num_tokens": 86050677.0, "step": 1129 }, { "epoch": 1.4092326887086712, "grad_norm": 0.1159217234754072, "learning_rate": 6.167123333800014e-06, "loss": 0.0372, "num_tokens": 86127558.0, "step": 1130 }, { "epoch": 1.4104803493449782, "grad_norm": 0.10442739115930703, "learning_rate": 6.161130038584762e-06, "loss": 0.0369, "num_tokens": 86204395.0, "step": 1131 }, { "epoch": 1.4117280099812852, "grad_norm": 0.11747181262627125, "learning_rate": 6.155135544540917e-06, "loss": 0.036, "num_tokens": 86280341.0, "step": 1132 }, { "epoch": 1.412975670617592, "grad_norm": 0.12634337046270444, "learning_rate": 6.1491398625383116e-06, "loss": 0.0379, "num_tokens": 86356632.0, "step": 1133 }, { "epoch": 1.414223331253899, "grad_norm": 0.12959775986090372, "learning_rate": 6.143143003448929e-06, "loss": 0.0399, "num_tokens": 86433774.0, "step": 1134 }, { "epoch": 1.415470991890206, "grad_norm": 0.11959740448330311, "learning_rate": 6.1371449781468835e-06, "loss": 0.0373, "num_tokens": 86509784.0, "step": 1135 }, { "epoch": 1.4167186525265127, "grad_norm": 0.1371476505333162, "learning_rate": 6.131145797508414e-06, "loss": 0.0395, "num_tokens": 86586524.0, "step": 1136 }, { "epoch": 1.4179663131628197, "grad_norm": 0.11223564174314306, "learning_rate": 6.125145472411845e-06, "loss": 0.0376, "num_tokens": 86663862.0, "step": 1137 }, { "epoch": 1.4192139737991267, "grad_norm": 0.12820345748805365, "learning_rate": 6.1191440137375775e-06, "loss": 0.0377, "num_tokens": 86740239.0, "step": 1138 }, { "epoch": 1.4204616344354335, "grad_norm": 0.11565453169043415, "learning_rate": 6.113141432368075e-06, "loss": 0.0377, "num_tokens": 86817067.0, "step": 1139 }, { "epoch": 1.4217092950717405, "grad_norm": 0.12044599920690999, "learning_rate": 6.107137739187827e-06, "loss": 0.0357, "num_tokens": 86892431.0, "step": 1140 }, { "epoch": 1.4229569557080475, "grad_norm": 0.13067772165236136, "learning_rate": 6.101132945083347e-06, "loss": 0.0395, "num_tokens": 86968048.0, "step": 1141 }, { "epoch": 1.4242046163443542, "grad_norm": 0.11788372463688408, "learning_rate": 6.095127060943141e-06, "loss": 0.0362, "num_tokens": 87042935.0, "step": 1142 }, { "epoch": 1.4254522769806612, "grad_norm": 0.11002089825894382, "learning_rate": 6.089120097657692e-06, "loss": 0.0383, "num_tokens": 87120151.0, "step": 1143 }, { "epoch": 1.4266999376169682, "grad_norm": 0.12686988054618584, "learning_rate": 6.083112066119439e-06, "loss": 0.0385, "num_tokens": 87196229.0, "step": 1144 }, { "epoch": 1.427947598253275, "grad_norm": 0.12173931969583418, "learning_rate": 6.077102977222763e-06, "loss": 0.0406, "num_tokens": 87273162.0, "step": 1145 }, { "epoch": 1.429195258889582, "grad_norm": 0.12795374944584967, "learning_rate": 6.0710928418639515e-06, "loss": 0.0384, "num_tokens": 87349259.0, "step": 1146 }, { "epoch": 1.430442919525889, "grad_norm": 0.14248994453706956, "learning_rate": 6.065081670941204e-06, "loss": 0.04, "num_tokens": 87426901.0, "step": 1147 }, { "epoch": 1.4316905801621957, "grad_norm": 0.11818189693034521, "learning_rate": 6.059069475354586e-06, "loss": 0.0371, "num_tokens": 87504412.0, "step": 1148 }, { "epoch": 1.4329382407985027, "grad_norm": 0.11293724815408349, "learning_rate": 6.0530562660060276e-06, "loss": 0.0365, "num_tokens": 87579968.0, "step": 1149 }, { "epoch": 1.4341859014348097, "grad_norm": 0.12150371365500466, "learning_rate": 6.0470420537992915e-06, "loss": 0.0375, "num_tokens": 87656442.0, "step": 1150 }, { "epoch": 1.4354335620711167, "grad_norm": 0.11127014521567379, "learning_rate": 6.041026849639966e-06, "loss": 0.0361, "num_tokens": 87732219.0, "step": 1151 }, { "epoch": 1.4366812227074237, "grad_norm": 0.12430543357003211, "learning_rate": 6.035010664435434e-06, "loss": 0.0414, "num_tokens": 87807872.0, "step": 1152 }, { "epoch": 1.4379288833437305, "grad_norm": 0.1215550972541805, "learning_rate": 6.0289935090948536e-06, "loss": 0.04, "num_tokens": 87885065.0, "step": 1153 }, { "epoch": 1.4391765439800375, "grad_norm": 0.12018062200786944, "learning_rate": 6.022975394529149e-06, "loss": 0.0369, "num_tokens": 87960894.0, "step": 1154 }, { "epoch": 1.4404242046163445, "grad_norm": 0.11641064053528057, "learning_rate": 6.016956331650984e-06, "loss": 0.0378, "num_tokens": 88037862.0, "step": 1155 }, { "epoch": 1.4416718652526512, "grad_norm": 0.12049199468836105, "learning_rate": 6.010936331374735e-06, "loss": 0.0362, "num_tokens": 88113631.0, "step": 1156 }, { "epoch": 1.4429195258889582, "grad_norm": 0.1117063446883456, "learning_rate": 6.00491540461648e-06, "loss": 0.0369, "num_tokens": 88189354.0, "step": 1157 }, { "epoch": 1.4441671865252652, "grad_norm": 0.1213557530389275, "learning_rate": 5.998893562293986e-06, "loss": 0.0373, "num_tokens": 88265382.0, "step": 1158 }, { "epoch": 1.445414847161572, "grad_norm": 0.12152558671097215, "learning_rate": 5.992870815326667e-06, "loss": 0.0367, "num_tokens": 88342173.0, "step": 1159 }, { "epoch": 1.446662507797879, "grad_norm": 0.11057929090406309, "learning_rate": 5.986847174635586e-06, "loss": 0.0354, "num_tokens": 88418805.0, "step": 1160 }, { "epoch": 1.447910168434186, "grad_norm": 0.12399316687481432, "learning_rate": 5.980822651143426e-06, "loss": 0.0387, "num_tokens": 88494308.0, "step": 1161 }, { "epoch": 1.4491578290704927, "grad_norm": 0.11572790861021126, "learning_rate": 5.9747972557744675e-06, "loss": 0.0381, "num_tokens": 88570633.0, "step": 1162 }, { "epoch": 1.4504054897067997, "grad_norm": 0.11539324409857993, "learning_rate": 5.968770999454572e-06, "loss": 0.0374, "num_tokens": 88647435.0, "step": 1163 }, { "epoch": 1.4516531503431067, "grad_norm": 0.12038386699531019, "learning_rate": 5.962743893111165e-06, "loss": 0.0395, "num_tokens": 88724582.0, "step": 1164 }, { "epoch": 1.4529008109794135, "grad_norm": 0.13615446233989564, "learning_rate": 5.956715947673212e-06, "loss": 0.0377, "num_tokens": 88800743.0, "step": 1165 }, { "epoch": 1.4541484716157205, "grad_norm": 0.11828852500542161, "learning_rate": 5.950687174071201e-06, "loss": 0.0344, "num_tokens": 88875189.0, "step": 1166 }, { "epoch": 1.4553961322520275, "grad_norm": 0.11198051066883659, "learning_rate": 5.944657583237119e-06, "loss": 0.0356, "num_tokens": 88950046.0, "step": 1167 }, { "epoch": 1.4566437928883345, "grad_norm": 0.12304932000403278, "learning_rate": 5.938627186104438e-06, "loss": 0.04, "num_tokens": 89026982.0, "step": 1168 }, { "epoch": 1.4578914535246412, "grad_norm": 0.1287086555877926, "learning_rate": 5.932595993608092e-06, "loss": 0.0399, "num_tokens": 89102664.0, "step": 1169 }, { "epoch": 1.4591391141609482, "grad_norm": 0.13685349466568447, "learning_rate": 5.926564016684453e-06, "loss": 0.0361, "num_tokens": 89178795.0, "step": 1170 }, { "epoch": 1.4603867747972552, "grad_norm": 0.11703659063026274, "learning_rate": 5.920531266271317e-06, "loss": 0.0386, "num_tokens": 89254845.0, "step": 1171 }, { "epoch": 1.4616344354335622, "grad_norm": 0.12135909606309495, "learning_rate": 5.9144977533078885e-06, "loss": 0.0382, "num_tokens": 89332936.0, "step": 1172 }, { "epoch": 1.462882096069869, "grad_norm": 0.12119899763340669, "learning_rate": 5.90846348873475e-06, "loss": 0.0384, "num_tokens": 89409906.0, "step": 1173 }, { "epoch": 1.464129756706176, "grad_norm": 0.1257634185270069, "learning_rate": 5.902428483493845e-06, "loss": 0.0382, "num_tokens": 89486609.0, "step": 1174 }, { "epoch": 1.465377417342483, "grad_norm": 0.1258432085205612, "learning_rate": 5.89639274852846e-06, "loss": 0.0362, "num_tokens": 89562321.0, "step": 1175 }, { "epoch": 1.4666250779787897, "grad_norm": 0.12013512152698796, "learning_rate": 5.890356294783213e-06, "loss": 0.038, "num_tokens": 89638226.0, "step": 1176 }, { "epoch": 1.4678727386150967, "grad_norm": 0.136427771320821, "learning_rate": 5.8843191332040125e-06, "loss": 0.04, "num_tokens": 89714165.0, "step": 1177 }, { "epoch": 1.4691203992514037, "grad_norm": 0.12180732554236498, "learning_rate": 5.878281274738061e-06, "loss": 0.0365, "num_tokens": 89790575.0, "step": 1178 }, { "epoch": 1.4703680598877105, "grad_norm": 0.12196206277349746, "learning_rate": 5.872242730333822e-06, "loss": 0.0366, "num_tokens": 89866156.0, "step": 1179 }, { "epoch": 1.4716157205240175, "grad_norm": 0.11523527520323638, "learning_rate": 5.866203510940998e-06, "loss": 0.039, "num_tokens": 89941867.0, "step": 1180 }, { "epoch": 1.4728633811603244, "grad_norm": 0.1296767966657419, "learning_rate": 5.860163627510521e-06, "loss": 0.0395, "num_tokens": 90019668.0, "step": 1181 }, { "epoch": 1.4741110417966312, "grad_norm": 0.11213362843515545, "learning_rate": 5.854123090994524e-06, "loss": 0.0377, "num_tokens": 90096135.0, "step": 1182 }, { "epoch": 1.4753587024329382, "grad_norm": 0.10840648305544608, "learning_rate": 5.848081912346329e-06, "loss": 0.0362, "num_tokens": 90172349.0, "step": 1183 }, { "epoch": 1.4766063630692452, "grad_norm": 0.11369092296626487, "learning_rate": 5.842040102520416e-06, "loss": 0.0375, "num_tokens": 90249853.0, "step": 1184 }, { "epoch": 1.477854023705552, "grad_norm": 0.12170948408983744, "learning_rate": 5.8359976724724146e-06, "loss": 0.0381, "num_tokens": 90327724.0, "step": 1185 }, { "epoch": 1.479101684341859, "grad_norm": 0.11993171807152257, "learning_rate": 5.829954633159073e-06, "loss": 0.0364, "num_tokens": 90404285.0, "step": 1186 }, { "epoch": 1.480349344978166, "grad_norm": 0.11597109699676006, "learning_rate": 5.823910995538251e-06, "loss": 0.0371, "num_tokens": 90480540.0, "step": 1187 }, { "epoch": 1.481597005614473, "grad_norm": 0.11978029510757789, "learning_rate": 5.8178667705688895e-06, "loss": 0.0367, "num_tokens": 90557248.0, "step": 1188 }, { "epoch": 1.4828446662507797, "grad_norm": 0.121547607943497, "learning_rate": 5.811821969210995e-06, "loss": 0.0365, "num_tokens": 90631870.0, "step": 1189 }, { "epoch": 1.4840923268870867, "grad_norm": 0.11514573974937457, "learning_rate": 5.8057766024256205e-06, "loss": 0.0366, "num_tokens": 90707243.0, "step": 1190 }, { "epoch": 1.4853399875233937, "grad_norm": 0.1186525151878151, "learning_rate": 5.799730681174842e-06, "loss": 0.0361, "num_tokens": 90784104.0, "step": 1191 }, { "epoch": 1.4865876481597007, "grad_norm": 0.11477823661113125, "learning_rate": 5.793684216421744e-06, "loss": 0.0387, "num_tokens": 90859903.0, "step": 1192 }, { "epoch": 1.4878353087960074, "grad_norm": 0.12385914646775886, "learning_rate": 5.787637219130392e-06, "loss": 0.0384, "num_tokens": 90934859.0, "step": 1193 }, { "epoch": 1.4890829694323144, "grad_norm": 0.12832223803870393, "learning_rate": 5.781589700265823e-06, "loss": 0.0369, "num_tokens": 91011413.0, "step": 1194 }, { "epoch": 1.4903306300686214, "grad_norm": 0.11256583038803755, "learning_rate": 5.7755416707940135e-06, "loss": 0.035, "num_tokens": 91086803.0, "step": 1195 }, { "epoch": 1.4915782907049282, "grad_norm": 0.12243730113925984, "learning_rate": 5.76949314168187e-06, "loss": 0.0374, "num_tokens": 91161954.0, "step": 1196 }, { "epoch": 1.4928259513412352, "grad_norm": 0.12768023862843755, "learning_rate": 5.763444123897206e-06, "loss": 0.0377, "num_tokens": 91239788.0, "step": 1197 }, { "epoch": 1.4940736119775422, "grad_norm": 0.11012832453778412, "learning_rate": 5.757394628408716e-06, "loss": 0.0361, "num_tokens": 91314747.0, "step": 1198 }, { "epoch": 1.495321272613849, "grad_norm": 0.12227103519562003, "learning_rate": 5.7513446661859664e-06, "loss": 0.0353, "num_tokens": 91390165.0, "step": 1199 }, { "epoch": 1.496568933250156, "grad_norm": 0.13124194561569905, "learning_rate": 5.7452942481993655e-06, "loss": 0.0388, "num_tokens": 91467820.0, "step": 1200 }, { "epoch": 1.497816593886463, "grad_norm": 0.12306926635997256, "learning_rate": 5.739243385420151e-06, "loss": 0.0344, "num_tokens": 91542554.0, "step": 1201 }, { "epoch": 1.4990642545227697, "grad_norm": 0.10980086429236924, "learning_rate": 5.7331920888203655e-06, "loss": 0.043, "num_tokens": 91618265.0, "step": 1202 }, { "epoch": 1.5003119151590767, "grad_norm": 0.12750053322537247, "learning_rate": 5.727140369372838e-06, "loss": 0.0374, "num_tokens": 91693957.0, "step": 1203 }, { "epoch": 1.5015595757953837, "grad_norm": 0.1350691229685045, "learning_rate": 5.721088238051168e-06, "loss": 0.0423, "num_tokens": 91770974.0, "step": 1204 }, { "epoch": 1.5028072364316905, "grad_norm": 0.12111928534382646, "learning_rate": 5.715035705829696e-06, "loss": 0.0378, "num_tokens": 91848134.0, "step": 1205 }, { "epoch": 1.5040548970679977, "grad_norm": 0.12794780455854038, "learning_rate": 5.708982783683492e-06, "loss": 0.0357, "num_tokens": 91924118.0, "step": 1206 }, { "epoch": 1.5053025577043044, "grad_norm": 0.1084695976853833, "learning_rate": 5.7029294825883365e-06, "loss": 0.0346, "num_tokens": 91999247.0, "step": 1207 }, { "epoch": 1.5065502183406112, "grad_norm": 0.11562042009012471, "learning_rate": 5.696875813520691e-06, "loss": 0.0428, "num_tokens": 92074627.0, "step": 1208 }, { "epoch": 1.5077978789769184, "grad_norm": 0.12053188830493768, "learning_rate": 5.69082178745769e-06, "loss": 0.0364, "num_tokens": 92151320.0, "step": 1209 }, { "epoch": 1.5090455396132252, "grad_norm": 0.11805305700246772, "learning_rate": 5.68476741537711e-06, "loss": 0.0372, "num_tokens": 92226893.0, "step": 1210 }, { "epoch": 1.5102932002495322, "grad_norm": 0.11833239189442073, "learning_rate": 5.678712708257358e-06, "loss": 0.0353, "num_tokens": 92305191.0, "step": 1211 }, { "epoch": 1.5115408608858392, "grad_norm": 0.11372795928925503, "learning_rate": 5.672657677077449e-06, "loss": 0.0366, "num_tokens": 92380981.0, "step": 1212 }, { "epoch": 1.512788521522146, "grad_norm": 0.12034696590824727, "learning_rate": 5.666602332816985e-06, "loss": 0.0355, "num_tokens": 92455797.0, "step": 1213 }, { "epoch": 1.514036182158453, "grad_norm": 0.12788533264540333, "learning_rate": 5.6605466864561344e-06, "loss": 0.0365, "num_tokens": 92532365.0, "step": 1214 }, { "epoch": 1.51528384279476, "grad_norm": 0.11721602859972623, "learning_rate": 5.654490748975615e-06, "loss": 0.0382, "num_tokens": 92608889.0, "step": 1215 }, { "epoch": 1.5165315034310667, "grad_norm": 0.11978374195139166, "learning_rate": 5.648434531356671e-06, "loss": 0.0374, "num_tokens": 92685321.0, "step": 1216 }, { "epoch": 1.5177791640673737, "grad_norm": 0.11893827548661735, "learning_rate": 5.642378044581057e-06, "loss": 0.0365, "num_tokens": 92762518.0, "step": 1217 }, { "epoch": 1.5190268247036807, "grad_norm": 0.1160725816936642, "learning_rate": 5.636321299631015e-06, "loss": 0.037, "num_tokens": 92837516.0, "step": 1218 }, { "epoch": 1.5202744853399874, "grad_norm": 0.12435223239608878, "learning_rate": 5.630264307489251e-06, "loss": 0.0368, "num_tokens": 92913538.0, "step": 1219 }, { "epoch": 1.5215221459762944, "grad_norm": 0.1152960182186084, "learning_rate": 5.624207079138922e-06, "loss": 0.0352, "num_tokens": 92989240.0, "step": 1220 }, { "epoch": 1.5227698066126014, "grad_norm": 0.12002871315414164, "learning_rate": 5.6181496255636195e-06, "loss": 0.0369, "num_tokens": 93065615.0, "step": 1221 }, { "epoch": 1.5240174672489082, "grad_norm": 0.11869764826176993, "learning_rate": 5.612091957747333e-06, "loss": 0.0359, "num_tokens": 93141553.0, "step": 1222 }, { "epoch": 1.5252651278852152, "grad_norm": 0.11533368448603282, "learning_rate": 5.606034086674447e-06, "loss": 0.0377, "num_tokens": 93217211.0, "step": 1223 }, { "epoch": 1.5265127885215222, "grad_norm": 0.12493028735607448, "learning_rate": 5.5999760233297115e-06, "loss": 0.0371, "num_tokens": 93292801.0, "step": 1224 }, { "epoch": 1.527760449157829, "grad_norm": 0.12755055829863948, "learning_rate": 5.593917778698227e-06, "loss": 0.0397, "num_tokens": 93369348.0, "step": 1225 }, { "epoch": 1.5290081097941361, "grad_norm": 0.1197732646050575, "learning_rate": 5.5878593637654226e-06, "loss": 0.0381, "num_tokens": 93445929.0, "step": 1226 }, { "epoch": 1.530255770430443, "grad_norm": 0.11713177274161533, "learning_rate": 5.581800789517036e-06, "loss": 0.0368, "num_tokens": 93523595.0, "step": 1227 }, { "epoch": 1.5315034310667497, "grad_norm": 0.11452856294775239, "learning_rate": 5.5757420669390925e-06, "loss": 0.0367, "num_tokens": 93599007.0, "step": 1228 }, { "epoch": 1.532751091703057, "grad_norm": 0.11955913139155055, "learning_rate": 5.5696832070178885e-06, "loss": 0.0384, "num_tokens": 93674617.0, "step": 1229 }, { "epoch": 1.5339987523393637, "grad_norm": 0.11445496167989216, "learning_rate": 5.563624220739969e-06, "loss": 0.0375, "num_tokens": 93750264.0, "step": 1230 }, { "epoch": 1.5352464129756707, "grad_norm": 0.1134416374479405, "learning_rate": 5.557565119092106e-06, "loss": 0.0366, "num_tokens": 93825604.0, "step": 1231 }, { "epoch": 1.5364940736119777, "grad_norm": 0.11119872151750049, "learning_rate": 5.551505913061281e-06, "loss": 0.0351, "num_tokens": 93900858.0, "step": 1232 }, { "epoch": 1.5377417342482844, "grad_norm": 0.12334463348947557, "learning_rate": 5.54544661363467e-06, "loss": 0.0389, "num_tokens": 93978548.0, "step": 1233 }, { "epoch": 1.5389893948845914, "grad_norm": 0.10909156980739217, "learning_rate": 5.53938723179961e-06, "loss": 0.0368, "num_tokens": 94054145.0, "step": 1234 }, { "epoch": 1.5402370555208984, "grad_norm": 0.12443155582203641, "learning_rate": 5.533327778543593e-06, "loss": 0.0388, "num_tokens": 94129620.0, "step": 1235 }, { "epoch": 1.5414847161572052, "grad_norm": 0.11000189874354667, "learning_rate": 5.527268264854241e-06, "loss": 0.0367, "num_tokens": 94207087.0, "step": 1236 }, { "epoch": 1.5427323767935122, "grad_norm": 0.13284023214221294, "learning_rate": 5.521208701719284e-06, "loss": 0.0394, "num_tokens": 94283829.0, "step": 1237 }, { "epoch": 1.5439800374298192, "grad_norm": 0.11877957335168572, "learning_rate": 5.515149100126539e-06, "loss": 0.0384, "num_tokens": 94360479.0, "step": 1238 }, { "epoch": 1.545227698066126, "grad_norm": 0.11498644957104806, "learning_rate": 5.509089471063897e-06, "loss": 0.039, "num_tokens": 94439077.0, "step": 1239 }, { "epoch": 1.546475358702433, "grad_norm": 0.1257089848903344, "learning_rate": 5.503029825519296e-06, "loss": 0.0374, "num_tokens": 94515106.0, "step": 1240 }, { "epoch": 1.54772301933874, "grad_norm": 0.11451851398012343, "learning_rate": 5.496970174480706e-06, "loss": 0.0384, "num_tokens": 94591702.0, "step": 1241 }, { "epoch": 1.5489706799750467, "grad_norm": 0.14041696423500313, "learning_rate": 5.4909105289361055e-06, "loss": 0.056, "num_tokens": 94668695.0, "step": 1242 }, { "epoch": 1.5502183406113537, "grad_norm": 0.12498370183180903, "learning_rate": 5.4848508998734626e-06, "loss": 0.039, "num_tokens": 94744775.0, "step": 1243 }, { "epoch": 1.5514660012476607, "grad_norm": 0.1315344410628175, "learning_rate": 5.478791298280719e-06, "loss": 0.0362, "num_tokens": 94819820.0, "step": 1244 }, { "epoch": 1.5527136618839674, "grad_norm": 0.11033302554173556, "learning_rate": 5.47273173514576e-06, "loss": 0.0354, "num_tokens": 94895512.0, "step": 1245 }, { "epoch": 1.5539613225202746, "grad_norm": 0.1110224016665635, "learning_rate": 5.466672221456408e-06, "loss": 0.0376, "num_tokens": 94971206.0, "step": 1246 }, { "epoch": 1.5552089831565814, "grad_norm": 0.11225773695191615, "learning_rate": 5.4606127682003915e-06, "loss": 0.0422, "num_tokens": 95047311.0, "step": 1247 }, { "epoch": 1.5564566437928882, "grad_norm": 0.11231998616938674, "learning_rate": 5.454553386365333e-06, "loss": 0.0352, "num_tokens": 95123071.0, "step": 1248 }, { "epoch": 1.5577043044291954, "grad_norm": 0.11468464677858389, "learning_rate": 5.44849408693872e-06, "loss": 0.0381, "num_tokens": 95200046.0, "step": 1249 }, { "epoch": 1.5589519650655022, "grad_norm": 0.11582658442663075, "learning_rate": 5.4424348809078974e-06, "loss": 0.0365, "num_tokens": 95275311.0, "step": 1250 }, { "epoch": 1.5601996257018091, "grad_norm": 0.11669328477123285, "learning_rate": 5.436375779260034e-06, "loss": 0.0348, "num_tokens": 95351246.0, "step": 1251 }, { "epoch": 1.5614472863381161, "grad_norm": 0.11537314709151678, "learning_rate": 5.430316792982112e-06, "loss": 0.0373, "num_tokens": 95426887.0, "step": 1252 }, { "epoch": 1.562694946974423, "grad_norm": 0.11989449996947185, "learning_rate": 5.424257933060908e-06, "loss": 0.0346, "num_tokens": 95502541.0, "step": 1253 }, { "epoch": 1.56394260761073, "grad_norm": 0.1142438417076298, "learning_rate": 5.418199210482965e-06, "loss": 0.0397, "num_tokens": 95580052.0, "step": 1254 }, { "epoch": 1.5651902682470369, "grad_norm": 0.11351778634936657, "learning_rate": 5.412140636234579e-06, "loss": 0.037, "num_tokens": 95655811.0, "step": 1255 }, { "epoch": 1.5664379288833437, "grad_norm": 0.11591053947282466, "learning_rate": 5.4060822213017745e-06, "loss": 0.0374, "num_tokens": 95732008.0, "step": 1256 }, { "epoch": 1.5676855895196506, "grad_norm": 0.11163580600728568, "learning_rate": 5.400023976670291e-06, "loss": 0.0388, "num_tokens": 95808735.0, "step": 1257 }, { "epoch": 1.5689332501559576, "grad_norm": 0.11912649417015345, "learning_rate": 5.393965913325555e-06, "loss": 0.0376, "num_tokens": 95884978.0, "step": 1258 }, { "epoch": 1.5701809107922644, "grad_norm": 0.12010176790938462, "learning_rate": 5.387908042252667e-06, "loss": 0.0364, "num_tokens": 95962331.0, "step": 1259 }, { "epoch": 1.5714285714285714, "grad_norm": 0.11404213718160137, "learning_rate": 5.381850374436383e-06, "loss": 0.0375, "num_tokens": 96039168.0, "step": 1260 }, { "epoch": 1.5726762320648784, "grad_norm": 0.11763900222873848, "learning_rate": 5.3757929208610784e-06, "loss": 0.0382, "num_tokens": 96115308.0, "step": 1261 }, { "epoch": 1.5739238927011852, "grad_norm": 0.1099813965721135, "learning_rate": 5.3697356925107514e-06, "loss": 0.0359, "num_tokens": 96191451.0, "step": 1262 }, { "epoch": 1.5751715533374921, "grad_norm": 0.1092899473174701, "learning_rate": 5.363678700368987e-06, "loss": 0.0351, "num_tokens": 96267849.0, "step": 1263 }, { "epoch": 1.5764192139737991, "grad_norm": 0.11219756592113182, "learning_rate": 5.3576219554189445e-06, "loss": 0.0392, "num_tokens": 96343518.0, "step": 1264 }, { "epoch": 1.577666874610106, "grad_norm": 0.11513865500406484, "learning_rate": 5.35156546864333e-06, "loss": 0.0359, "num_tokens": 96419685.0, "step": 1265 }, { "epoch": 1.5789145352464131, "grad_norm": 0.11755138786676107, "learning_rate": 5.345509251024387e-06, "loss": 0.0365, "num_tokens": 96494987.0, "step": 1266 }, { "epoch": 1.5801621958827199, "grad_norm": 0.10723917441034322, "learning_rate": 5.339453313543868e-06, "loss": 0.037, "num_tokens": 96571590.0, "step": 1267 }, { "epoch": 1.5814098565190269, "grad_norm": 0.11831131657005481, "learning_rate": 5.3333976671830165e-06, "loss": 0.0376, "num_tokens": 96648687.0, "step": 1268 }, { "epoch": 1.5826575171553339, "grad_norm": 0.12060744557345109, "learning_rate": 5.327342322922553e-06, "loss": 0.0377, "num_tokens": 96726743.0, "step": 1269 }, { "epoch": 1.5839051777916406, "grad_norm": 0.10568069351830238, "learning_rate": 5.321287291742645e-06, "loss": 0.034, "num_tokens": 96801933.0, "step": 1270 }, { "epoch": 1.5851528384279476, "grad_norm": 0.1138618386162909, "learning_rate": 5.315232584622893e-06, "loss": 0.0354, "num_tokens": 96876827.0, "step": 1271 }, { "epoch": 1.5864004990642546, "grad_norm": 0.11948278608459084, "learning_rate": 5.309178212542313e-06, "loss": 0.0349, "num_tokens": 96951983.0, "step": 1272 }, { "epoch": 1.5876481597005614, "grad_norm": 0.11306709736045806, "learning_rate": 5.303124186479309e-06, "loss": 0.0355, "num_tokens": 97027382.0, "step": 1273 }, { "epoch": 1.5888958203368684, "grad_norm": 0.11536595148537067, "learning_rate": 5.297070517411664e-06, "loss": 0.0354, "num_tokens": 97102461.0, "step": 1274 }, { "epoch": 1.5901434809731754, "grad_norm": 0.11243412386645973, "learning_rate": 5.2910172163165096e-06, "loss": 0.0375, "num_tokens": 97179120.0, "step": 1275 }, { "epoch": 1.5913911416094821, "grad_norm": 0.12089055030621573, "learning_rate": 5.284964294170306e-06, "loss": 0.0367, "num_tokens": 97255298.0, "step": 1276 }, { "epoch": 1.5926388022457891, "grad_norm": 0.12214280046521081, "learning_rate": 5.278911761948834e-06, "loss": 0.037, "num_tokens": 97331060.0, "step": 1277 }, { "epoch": 1.5938864628820961, "grad_norm": 0.1062549969079459, "learning_rate": 5.272859630627164e-06, "loss": 0.0352, "num_tokens": 97406110.0, "step": 1278 }, { "epoch": 1.595134123518403, "grad_norm": 0.12246112884589401, "learning_rate": 5.266807911179638e-06, "loss": 0.039, "num_tokens": 97483449.0, "step": 1279 }, { "epoch": 1.5963817841547099, "grad_norm": 0.11103361608657275, "learning_rate": 5.260756614579851e-06, "loss": 0.0372, "num_tokens": 97560311.0, "step": 1280 }, { "epoch": 1.5976294447910169, "grad_norm": 0.135106005974476, "learning_rate": 5.254705751800636e-06, "loss": 0.0391, "num_tokens": 97638070.0, "step": 1281 }, { "epoch": 1.5988771054273236, "grad_norm": 0.11795350414372122, "learning_rate": 5.248655333814036e-06, "loss": 0.0367, "num_tokens": 97714118.0, "step": 1282 }, { "epoch": 1.6001247660636309, "grad_norm": 0.1277071179872957, "learning_rate": 5.242605371591286e-06, "loss": 0.0378, "num_tokens": 97790110.0, "step": 1283 }, { "epoch": 1.6013724266999376, "grad_norm": 0.10832036457269213, "learning_rate": 5.236555876102797e-06, "loss": 0.0368, "num_tokens": 97865186.0, "step": 1284 }, { "epoch": 1.6026200873362444, "grad_norm": 0.12298238402683899, "learning_rate": 5.2305068583181314e-06, "loss": 0.0359, "num_tokens": 97941587.0, "step": 1285 }, { "epoch": 1.6038677479725516, "grad_norm": 0.11205375865529897, "learning_rate": 5.2244583292059896e-06, "loss": 0.0351, "num_tokens": 98017073.0, "step": 1286 }, { "epoch": 1.6051154086088584, "grad_norm": 0.11398489887533124, "learning_rate": 5.218410299734181e-06, "loss": 0.0361, "num_tokens": 98093084.0, "step": 1287 }, { "epoch": 1.6063630692451654, "grad_norm": 0.11331925773487742, "learning_rate": 5.2123627808696084e-06, "loss": 0.0378, "num_tokens": 98169702.0, "step": 1288 }, { "epoch": 1.6076107298814724, "grad_norm": 0.12754453783748512, "learning_rate": 5.206315783578258e-06, "loss": 0.0379, "num_tokens": 98246352.0, "step": 1289 }, { "epoch": 1.6088583905177791, "grad_norm": 0.11695300113526581, "learning_rate": 5.20026931882516e-06, "loss": 0.0351, "num_tokens": 98321908.0, "step": 1290 }, { "epoch": 1.6101060511540861, "grad_norm": 0.10619866825878661, "learning_rate": 5.194223397574381e-06, "loss": 0.034, "num_tokens": 98398421.0, "step": 1291 }, { "epoch": 1.611353711790393, "grad_norm": 0.11601433765217581, "learning_rate": 5.188178030789008e-06, "loss": 0.0375, "num_tokens": 98475418.0, "step": 1292 }, { "epoch": 1.6126013724266999, "grad_norm": 0.11176870281043169, "learning_rate": 5.1821332294311136e-06, "loss": 0.035, "num_tokens": 98551489.0, "step": 1293 }, { "epoch": 1.6138490330630069, "grad_norm": 0.12198142407040487, "learning_rate": 5.176089004461752e-06, "loss": 0.0376, "num_tokens": 98628378.0, "step": 1294 }, { "epoch": 1.6150966936993139, "grad_norm": 0.12752726575125428, "learning_rate": 5.170045366840929e-06, "loss": 0.0384, "num_tokens": 98704929.0, "step": 1295 }, { "epoch": 1.6163443543356206, "grad_norm": 0.12085444832378747, "learning_rate": 5.164002327527588e-06, "loss": 0.04, "num_tokens": 98782920.0, "step": 1296 }, { "epoch": 1.6175920149719276, "grad_norm": 0.13632096954307515, "learning_rate": 5.157959897479587e-06, "loss": 0.0373, "num_tokens": 98858537.0, "step": 1297 }, { "epoch": 1.6188396756082346, "grad_norm": 0.12306314338751427, "learning_rate": 5.151918087653672e-06, "loss": 0.0361, "num_tokens": 98934085.0, "step": 1298 }, { "epoch": 1.6200873362445414, "grad_norm": 0.1403645125183729, "learning_rate": 5.145876909005477e-06, "loss": 0.034, "num_tokens": 99009665.0, "step": 1299 }, { "epoch": 1.6213349968808484, "grad_norm": 0.1144609737580006, "learning_rate": 5.139836372489481e-06, "loss": 0.0341, "num_tokens": 99084393.0, "step": 1300 }, { "epoch": 1.6225826575171554, "grad_norm": 0.1073101633798548, "learning_rate": 5.133796489059005e-06, "loss": 0.0414, "num_tokens": 99163283.0, "step": 1301 }, { "epoch": 1.6238303181534621, "grad_norm": 0.12155196636230708, "learning_rate": 5.1277572696661806e-06, "loss": 0.0359, "num_tokens": 99240403.0, "step": 1302 }, { "epoch": 1.6250779787897693, "grad_norm": 0.11193137861888704, "learning_rate": 5.12171872526194e-06, "loss": 0.0391, "num_tokens": 99316331.0, "step": 1303 }, { "epoch": 1.626325639426076, "grad_norm": 0.1297846009862795, "learning_rate": 5.115680866795989e-06, "loss": 0.0386, "num_tokens": 99392413.0, "step": 1304 }, { "epoch": 1.6275733000623829, "grad_norm": 0.12104786113120562, "learning_rate": 5.109643705216789e-06, "loss": 0.0387, "num_tokens": 99468998.0, "step": 1305 }, { "epoch": 1.62882096069869, "grad_norm": 0.1259339680092441, "learning_rate": 5.103607251471541e-06, "loss": 0.0361, "num_tokens": 99544604.0, "step": 1306 }, { "epoch": 1.6300686213349969, "grad_norm": 0.11907192744895645, "learning_rate": 5.097571516506158e-06, "loss": 0.038, "num_tokens": 99621509.0, "step": 1307 }, { "epoch": 1.6313162819713038, "grad_norm": 0.12076333993471534, "learning_rate": 5.091536511265253e-06, "loss": 0.0382, "num_tokens": 99698692.0, "step": 1308 }, { "epoch": 1.6325639426076108, "grad_norm": 0.11339455289910984, "learning_rate": 5.085502246692111e-06, "loss": 0.0376, "num_tokens": 99775293.0, "step": 1309 }, { "epoch": 1.6338116032439176, "grad_norm": 0.1264608232290441, "learning_rate": 5.079468733728684e-06, "loss": 0.0364, "num_tokens": 99852057.0, "step": 1310 }, { "epoch": 1.6350592638802246, "grad_norm": 0.1091582073216211, "learning_rate": 5.07343598331555e-06, "loss": 0.0362, "num_tokens": 99928654.0, "step": 1311 }, { "epoch": 1.6363069245165316, "grad_norm": 0.11123927985290005, "learning_rate": 5.0674040063919114e-06, "loss": 0.0349, "num_tokens": 100003903.0, "step": 1312 }, { "epoch": 1.6375545851528384, "grad_norm": 0.11986584833501389, "learning_rate": 5.0613728138955644e-06, "loss": 0.0355, "num_tokens": 100079727.0, "step": 1313 }, { "epoch": 1.6388022457891454, "grad_norm": 0.11450610998058318, "learning_rate": 5.055342416762883e-06, "loss": 0.0378, "num_tokens": 100155922.0, "step": 1314 }, { "epoch": 1.6400499064254523, "grad_norm": 0.11831806865766152, "learning_rate": 5.0493128259288025e-06, "loss": 0.0371, "num_tokens": 100231834.0, "step": 1315 }, { "epoch": 1.641297567061759, "grad_norm": 0.12149075339785302, "learning_rate": 5.043284052326789e-06, "loss": 0.0368, "num_tokens": 100307612.0, "step": 1316 }, { "epoch": 1.642545227698066, "grad_norm": 0.12082559817872005, "learning_rate": 5.037256106888837e-06, "loss": 0.0365, "num_tokens": 100386062.0, "step": 1317 }, { "epoch": 1.643792888334373, "grad_norm": 0.12561599057345044, "learning_rate": 5.03122900054543e-06, "loss": 0.0355, "num_tokens": 100462184.0, "step": 1318 }, { "epoch": 1.6450405489706799, "grad_norm": 0.11883116105714127, "learning_rate": 5.025202744225535e-06, "loss": 0.0346, "num_tokens": 100538383.0, "step": 1319 }, { "epoch": 1.6462882096069869, "grad_norm": 0.12249245203298004, "learning_rate": 5.019177348856576e-06, "loss": 0.0371, "num_tokens": 100616079.0, "step": 1320 }, { "epoch": 1.6475358702432938, "grad_norm": 0.11058943111150786, "learning_rate": 5.013152825364416e-06, "loss": 0.0359, "num_tokens": 100692007.0, "step": 1321 }, { "epoch": 1.6487835308796006, "grad_norm": 0.12415625273211772, "learning_rate": 5.007129184673335e-06, "loss": 0.0371, "num_tokens": 100768530.0, "step": 1322 }, { "epoch": 1.6500311915159078, "grad_norm": 0.1105120072809638, "learning_rate": 5.001106437706016e-06, "loss": 0.035, "num_tokens": 100843567.0, "step": 1323 }, { "epoch": 1.6512788521522146, "grad_norm": 0.11777793444144743, "learning_rate": 4.99508459538352e-06, "loss": 0.038, "num_tokens": 100919975.0, "step": 1324 }, { "epoch": 1.6525265127885214, "grad_norm": 0.11792502762787642, "learning_rate": 4.989063668625267e-06, "loss": 0.0387, "num_tokens": 100996420.0, "step": 1325 }, { "epoch": 1.6537741734248286, "grad_norm": 0.12674786529957854, "learning_rate": 4.983043668349018e-06, "loss": 0.0366, "num_tokens": 101072591.0, "step": 1326 }, { "epoch": 1.6550218340611353, "grad_norm": 0.11470804032572889, "learning_rate": 4.977024605470851e-06, "loss": 0.0399, "num_tokens": 101149731.0, "step": 1327 }, { "epoch": 1.6562694946974423, "grad_norm": 0.1231867095999115, "learning_rate": 4.971006490905148e-06, "loss": 0.0391, "num_tokens": 101226108.0, "step": 1328 }, { "epoch": 1.6575171553337493, "grad_norm": 0.11568699365185363, "learning_rate": 4.964989335564571e-06, "loss": 0.0364, "num_tokens": 101301980.0, "step": 1329 }, { "epoch": 1.658764815970056, "grad_norm": 0.11488559020695345, "learning_rate": 4.958973150360034e-06, "loss": 0.038, "num_tokens": 101377961.0, "step": 1330 }, { "epoch": 1.660012476606363, "grad_norm": 0.10528800902274923, "learning_rate": 4.952957946200709e-06, "loss": 0.0334, "num_tokens": 101452942.0, "step": 1331 }, { "epoch": 1.66126013724267, "grad_norm": 0.11814463514870395, "learning_rate": 4.946943733993974e-06, "loss": 0.0369, "num_tokens": 101528775.0, "step": 1332 }, { "epoch": 1.6625077978789768, "grad_norm": 0.11188542272715878, "learning_rate": 4.940930524645414e-06, "loss": 0.0398, "num_tokens": 101605154.0, "step": 1333 }, { "epoch": 1.6637554585152838, "grad_norm": 0.13041645138750732, "learning_rate": 4.934918329058798e-06, "loss": 0.0383, "num_tokens": 101682190.0, "step": 1334 }, { "epoch": 1.6650031191515908, "grad_norm": 0.11426497483003493, "learning_rate": 4.928907158136049e-06, "loss": 0.0365, "num_tokens": 101757739.0, "step": 1335 }, { "epoch": 1.6662507797878976, "grad_norm": 0.11568107261393834, "learning_rate": 4.922897022777241e-06, "loss": 0.0363, "num_tokens": 101834357.0, "step": 1336 }, { "epoch": 1.6674984404242046, "grad_norm": 0.11574082195970792, "learning_rate": 4.916887933880562e-06, "loss": 0.0355, "num_tokens": 101909908.0, "step": 1337 }, { "epoch": 1.6687461010605116, "grad_norm": 0.1172235586101275, "learning_rate": 4.910879902342309e-06, "loss": 0.0364, "num_tokens": 101987781.0, "step": 1338 }, { "epoch": 1.6699937616968183, "grad_norm": 0.13798170432817686, "learning_rate": 4.904872939056859e-06, "loss": 0.0378, "num_tokens": 102063911.0, "step": 1339 }, { "epoch": 1.6712414223331253, "grad_norm": 0.1212513945751985, "learning_rate": 4.898867054916655e-06, "loss": 0.0384, "num_tokens": 102141336.0, "step": 1340 }, { "epoch": 1.6724890829694323, "grad_norm": 0.12928789262202314, "learning_rate": 4.892862260812174e-06, "loss": 0.0345, "num_tokens": 102218148.0, "step": 1341 }, { "epoch": 1.673736743605739, "grad_norm": 0.11821647712469623, "learning_rate": 4.886858567631927e-06, "loss": 0.039, "num_tokens": 102294670.0, "step": 1342 }, { "epoch": 1.6749844042420463, "grad_norm": 0.1192083959471349, "learning_rate": 4.880855986262424e-06, "loss": 0.0353, "num_tokens": 102371793.0, "step": 1343 }, { "epoch": 1.676232064878353, "grad_norm": 0.11043237689952129, "learning_rate": 4.874854527588159e-06, "loss": 0.0357, "num_tokens": 102446940.0, "step": 1344 }, { "epoch": 1.6774797255146598, "grad_norm": 0.12091130350927373, "learning_rate": 4.868854202491587e-06, "loss": 0.0365, "num_tokens": 102522517.0, "step": 1345 }, { "epoch": 1.678727386150967, "grad_norm": 0.11717551247947147, "learning_rate": 4.862855021853117e-06, "loss": 0.0364, "num_tokens": 102599001.0, "step": 1346 }, { "epoch": 1.6799750467872738, "grad_norm": 0.11055524190241156, "learning_rate": 4.856856996551074e-06, "loss": 0.0373, "num_tokens": 102675717.0, "step": 1347 }, { "epoch": 1.6812227074235808, "grad_norm": 0.12424488352828751, "learning_rate": 4.850860137461691e-06, "loss": 0.0373, "num_tokens": 102751593.0, "step": 1348 }, { "epoch": 1.6824703680598878, "grad_norm": 0.11204955443556182, "learning_rate": 4.844864455459085e-06, "loss": 0.0362, "num_tokens": 102827243.0, "step": 1349 }, { "epoch": 1.6837180286961946, "grad_norm": 0.11865902950722064, "learning_rate": 4.83886996141524e-06, "loss": 0.0359, "num_tokens": 102902365.0, "step": 1350 }, { "epoch": 1.6849656893325016, "grad_norm": 0.12365165052998159, "learning_rate": 4.8328766661999885e-06, "loss": 0.0367, "num_tokens": 102977634.0, "step": 1351 }, { "epoch": 1.6862133499688086, "grad_norm": 0.11769618162902643, "learning_rate": 4.826884580680981e-06, "loss": 0.036, "num_tokens": 103053963.0, "step": 1352 }, { "epoch": 1.6874610106051153, "grad_norm": 0.11005279715923194, "learning_rate": 4.8208937157236855e-06, "loss": 0.0363, "num_tokens": 103129617.0, "step": 1353 }, { "epoch": 1.6887086712414223, "grad_norm": 0.12190292059359654, "learning_rate": 4.814904082191349e-06, "loss": 0.0376, "num_tokens": 103205831.0, "step": 1354 }, { "epoch": 1.6899563318777293, "grad_norm": 0.12379736841695456, "learning_rate": 4.8089156909449845e-06, "loss": 0.0364, "num_tokens": 103281361.0, "step": 1355 }, { "epoch": 1.691203992514036, "grad_norm": 0.12172911071756917, "learning_rate": 4.802928552843358e-06, "loss": 0.0479, "num_tokens": 103359453.0, "step": 1356 }, { "epoch": 1.692451653150343, "grad_norm": 0.11458592687454641, "learning_rate": 4.79694267874296e-06, "loss": 0.0404, "num_tokens": 103436739.0, "step": 1357 }, { "epoch": 1.69369931378665, "grad_norm": 0.12664265435736952, "learning_rate": 4.790958079497991e-06, "loss": 0.0399, "num_tokens": 103513765.0, "step": 1358 }, { "epoch": 1.6949469744229568, "grad_norm": 0.1231061961741426, "learning_rate": 4.784974765960335e-06, "loss": 0.0378, "num_tokens": 103589063.0, "step": 1359 }, { "epoch": 1.696194635059264, "grad_norm": 0.11331377001939717, "learning_rate": 4.77899274897955e-06, "loss": 0.0359, "num_tokens": 103665303.0, "step": 1360 }, { "epoch": 1.6974422956955708, "grad_norm": 0.116217295144265, "learning_rate": 4.773012039402841e-06, "loss": 0.0351, "num_tokens": 103741004.0, "step": 1361 }, { "epoch": 1.6986899563318776, "grad_norm": 0.1176520133320377, "learning_rate": 4.767032648075043e-06, "loss": 0.0383, "num_tokens": 103817035.0, "step": 1362 }, { "epoch": 1.6999376169681848, "grad_norm": 0.12185429747688767, "learning_rate": 4.761054585838599e-06, "loss": 0.0372, "num_tokens": 103893856.0, "step": 1363 }, { "epoch": 1.7011852776044916, "grad_norm": 0.12816254491207327, "learning_rate": 4.755077863533541e-06, "loss": 0.0351, "num_tokens": 103970137.0, "step": 1364 }, { "epoch": 1.7024329382407986, "grad_norm": 0.11787000881995827, "learning_rate": 4.749102491997476e-06, "loss": 0.037, "num_tokens": 104046396.0, "step": 1365 }, { "epoch": 1.7036805988771055, "grad_norm": 0.10708489924117709, "learning_rate": 4.743128482065555e-06, "loss": 0.0351, "num_tokens": 104122836.0, "step": 1366 }, { "epoch": 1.7049282595134123, "grad_norm": 0.12224915882356628, "learning_rate": 4.737155844570468e-06, "loss": 0.0387, "num_tokens": 104200752.0, "step": 1367 }, { "epoch": 1.7061759201497193, "grad_norm": 0.11927797892198116, "learning_rate": 4.7311845903424104e-06, "loss": 0.0379, "num_tokens": 104276463.0, "step": 1368 }, { "epoch": 1.7074235807860263, "grad_norm": 0.11933248557644462, "learning_rate": 4.725214730209069e-06, "loss": 0.0376, "num_tokens": 104354190.0, "step": 1369 }, { "epoch": 1.708671241422333, "grad_norm": 0.11891032616900842, "learning_rate": 4.719246274995607e-06, "loss": 0.0339, "num_tokens": 104429464.0, "step": 1370 }, { "epoch": 1.70991890205864, "grad_norm": 0.11202623027798712, "learning_rate": 4.713279235524637e-06, "loss": 0.0377, "num_tokens": 104505684.0, "step": 1371 }, { "epoch": 1.711166562694947, "grad_norm": 0.11951117716097824, "learning_rate": 4.707313622616205e-06, "loss": 0.0354, "num_tokens": 104581004.0, "step": 1372 }, { "epoch": 1.7124142233312538, "grad_norm": 0.11819752380067314, "learning_rate": 4.701349447087769e-06, "loss": 0.0384, "num_tokens": 104658361.0, "step": 1373 }, { "epoch": 1.7136618839675608, "grad_norm": 0.11366388384713198, "learning_rate": 4.695386719754184e-06, "loss": 0.037, "num_tokens": 104734938.0, "step": 1374 }, { "epoch": 1.7149095446038678, "grad_norm": 0.12740314308973466, "learning_rate": 4.689425451427677e-06, "loss": 0.0356, "num_tokens": 104810696.0, "step": 1375 }, { "epoch": 1.7161572052401746, "grad_norm": 0.11222467035026419, "learning_rate": 4.683465652917828e-06, "loss": 0.0362, "num_tokens": 104886709.0, "step": 1376 }, { "epoch": 1.7174048658764816, "grad_norm": 0.11092206647603566, "learning_rate": 4.677507335031555e-06, "loss": 0.0364, "num_tokens": 104962238.0, "step": 1377 }, { "epoch": 1.7186525265127885, "grad_norm": 0.1160369465140294, "learning_rate": 4.671550508573087e-06, "loss": 0.0359, "num_tokens": 105037646.0, "step": 1378 }, { "epoch": 1.7199001871490953, "grad_norm": 0.11495558179367521, "learning_rate": 4.6655951843439514e-06, "loss": 0.0375, "num_tokens": 105113774.0, "step": 1379 }, { "epoch": 1.7211478477854025, "grad_norm": 0.12435135754767197, "learning_rate": 4.659641373142953e-06, "loss": 0.0386, "num_tokens": 105190747.0, "step": 1380 }, { "epoch": 1.7223955084217093, "grad_norm": 0.11693604602399771, "learning_rate": 4.653689085766147e-06, "loss": 0.0341, "num_tokens": 105267912.0, "step": 1381 }, { "epoch": 1.723643169058016, "grad_norm": 0.10558552346143842, "learning_rate": 4.6477383330068335e-06, "loss": 0.0367, "num_tokens": 105343487.0, "step": 1382 }, { "epoch": 1.7248908296943233, "grad_norm": 0.12228098738763672, "learning_rate": 4.641789125655526e-06, "loss": 0.0377, "num_tokens": 105420088.0, "step": 1383 }, { "epoch": 1.72613849033063, "grad_norm": 0.11328598660616651, "learning_rate": 4.6358414744999324e-06, "loss": 0.038, "num_tokens": 105496352.0, "step": 1384 }, { "epoch": 1.727386150966937, "grad_norm": 0.12964544697403055, "learning_rate": 4.6298953903249455e-06, "loss": 0.0394, "num_tokens": 105573063.0, "step": 1385 }, { "epoch": 1.728633811603244, "grad_norm": 0.1238931831498432, "learning_rate": 4.623950883912609e-06, "loss": 0.0367, "num_tokens": 105649475.0, "step": 1386 }, { "epoch": 1.7298814722395508, "grad_norm": 0.13042704795491192, "learning_rate": 4.618007966042114e-06, "loss": 0.0353, "num_tokens": 105725225.0, "step": 1387 }, { "epoch": 1.7311291328758578, "grad_norm": 0.11540798304998945, "learning_rate": 4.612066647489762e-06, "loss": 0.0346, "num_tokens": 105799845.0, "step": 1388 }, { "epoch": 1.7323767935121648, "grad_norm": 0.10019808616223308, "learning_rate": 4.606126939028965e-06, "loss": 0.0352, "num_tokens": 105875906.0, "step": 1389 }, { "epoch": 1.7336244541484715, "grad_norm": 0.11294492281764848, "learning_rate": 4.600188851430206e-06, "loss": 0.0339, "num_tokens": 105951201.0, "step": 1390 }, { "epoch": 1.7348721147847785, "grad_norm": 0.11156801308172176, "learning_rate": 4.594252395461036e-06, "loss": 0.0361, "num_tokens": 106027130.0, "step": 1391 }, { "epoch": 1.7361197754210855, "grad_norm": 0.11665884377063734, "learning_rate": 4.588317581886041e-06, "loss": 0.0362, "num_tokens": 106103346.0, "step": 1392 }, { "epoch": 1.7373674360573923, "grad_norm": 0.11454295291582704, "learning_rate": 4.5823844214668326e-06, "loss": 0.0381, "num_tokens": 106179489.0, "step": 1393 }, { "epoch": 1.7386150966936993, "grad_norm": 0.11318625067239595, "learning_rate": 4.576452924962024e-06, "loss": 0.0356, "num_tokens": 106255533.0, "step": 1394 }, { "epoch": 1.7398627573300063, "grad_norm": 0.11952096031253687, "learning_rate": 4.570523103127209e-06, "loss": 0.0369, "num_tokens": 106330931.0, "step": 1395 }, { "epoch": 1.741110417966313, "grad_norm": 0.12371072052697842, "learning_rate": 4.564594966714952e-06, "loss": 0.0345, "num_tokens": 106406216.0, "step": 1396 }, { "epoch": 1.74235807860262, "grad_norm": 0.10756413875277834, "learning_rate": 4.558668526474751e-06, "loss": 0.0369, "num_tokens": 106482048.0, "step": 1397 }, { "epoch": 1.743605739238927, "grad_norm": 0.10973946525613622, "learning_rate": 4.552743793153037e-06, "loss": 0.0374, "num_tokens": 106558458.0, "step": 1398 }, { "epoch": 1.7448533998752338, "grad_norm": 0.11554088759005555, "learning_rate": 4.5468207774931414e-06, "loss": 0.0371, "num_tokens": 106634391.0, "step": 1399 }, { "epoch": 1.746101060511541, "grad_norm": 0.11857009926153536, "learning_rate": 4.540899490235282e-06, "loss": 0.0369, "num_tokens": 106711587.0, "step": 1400 }, { "epoch": 1.7473487211478478, "grad_norm": 0.11113088782412625, "learning_rate": 4.534979942116542e-06, "loss": 0.0363, "num_tokens": 106787790.0, "step": 1401 }, { "epoch": 1.7485963817841546, "grad_norm": 0.11521710213865818, "learning_rate": 4.529062143870849e-06, "loss": 0.0363, "num_tokens": 106863918.0, "step": 1402 }, { "epoch": 1.7498440424204618, "grad_norm": 0.12355721482825922, "learning_rate": 4.5231461062289624e-06, "loss": 0.0352, "num_tokens": 106940044.0, "step": 1403 }, { "epoch": 1.7510917030567685, "grad_norm": 0.11027027150618844, "learning_rate": 4.5172318399184485e-06, "loss": 0.0371, "num_tokens": 107016503.0, "step": 1404 }, { "epoch": 1.7523393636930755, "grad_norm": 0.11557790009746331, "learning_rate": 4.511319355663657e-06, "loss": 0.0373, "num_tokens": 107093464.0, "step": 1405 }, { "epoch": 1.7535870243293825, "grad_norm": 0.10997539686939024, "learning_rate": 4.50540866418571e-06, "loss": 0.033, "num_tokens": 107169232.0, "step": 1406 }, { "epoch": 1.7548346849656893, "grad_norm": 0.10508302585453105, "learning_rate": 4.499499776202476e-06, "loss": 0.0365, "num_tokens": 107246281.0, "step": 1407 }, { "epoch": 1.7560823456019963, "grad_norm": 0.11986514390120263, "learning_rate": 4.493592702428558e-06, "loss": 0.0355, "num_tokens": 107322121.0, "step": 1408 }, { "epoch": 1.7573300062383033, "grad_norm": 0.11382666929205047, "learning_rate": 4.487687453575261e-06, "loss": 0.0349, "num_tokens": 107397732.0, "step": 1409 }, { "epoch": 1.75857766687461, "grad_norm": 0.11452741596501294, "learning_rate": 4.481784040350593e-06, "loss": 0.0377, "num_tokens": 107474105.0, "step": 1410 }, { "epoch": 1.759825327510917, "grad_norm": 0.11978232669192639, "learning_rate": 4.475882473459221e-06, "loss": 0.0346, "num_tokens": 107549721.0, "step": 1411 }, { "epoch": 1.761072988147224, "grad_norm": 0.10810211082067261, "learning_rate": 4.469982763602473e-06, "loss": 0.0377, "num_tokens": 107626252.0, "step": 1412 }, { "epoch": 1.7623206487835308, "grad_norm": 0.12430884790032933, "learning_rate": 4.464084921478303e-06, "loss": 0.0362, "num_tokens": 107701970.0, "step": 1413 }, { "epoch": 1.7635683094198378, "grad_norm": 0.1080229919974577, "learning_rate": 4.458188957781285e-06, "loss": 0.0363, "num_tokens": 107778230.0, "step": 1414 }, { "epoch": 1.7648159700561448, "grad_norm": 0.12546433579895047, "learning_rate": 4.452294883202581e-06, "loss": 0.0354, "num_tokens": 107854033.0, "step": 1415 }, { "epoch": 1.7660636306924515, "grad_norm": 0.10862233462213136, "learning_rate": 4.44640270842993e-06, "loss": 0.0344, "num_tokens": 107928602.0, "step": 1416 }, { "epoch": 1.7673112913287585, "grad_norm": 0.11234450273923224, "learning_rate": 4.440512444147626e-06, "loss": 0.0356, "num_tokens": 108005283.0, "step": 1417 }, { "epoch": 1.7685589519650655, "grad_norm": 0.12290779064818019, "learning_rate": 4.434624101036498e-06, "loss": 0.0408, "num_tokens": 108082409.0, "step": 1418 }, { "epoch": 1.7698066126013723, "grad_norm": 0.12019256254394751, "learning_rate": 4.4287376897738945e-06, "loss": 0.038, "num_tokens": 108159111.0, "step": 1419 }, { "epoch": 1.7710542732376795, "grad_norm": 0.12938504724279448, "learning_rate": 4.4228532210336535e-06, "loss": 0.0368, "num_tokens": 108236277.0, "step": 1420 }, { "epoch": 1.7723019338739863, "grad_norm": 0.12063659002771691, "learning_rate": 4.4169707054861e-06, "loss": 0.0356, "num_tokens": 108313163.0, "step": 1421 }, { "epoch": 1.773549594510293, "grad_norm": 0.12467033356761577, "learning_rate": 4.411090153798011e-06, "loss": 0.0357, "num_tokens": 108391348.0, "step": 1422 }, { "epoch": 1.7747972551466002, "grad_norm": 0.11455930343881551, "learning_rate": 4.405211576632602e-06, "loss": 0.0348, "num_tokens": 108467824.0, "step": 1423 }, { "epoch": 1.776044915782907, "grad_norm": 0.1069971790671395, "learning_rate": 4.3993349846495136e-06, "loss": 0.0341, "num_tokens": 108544120.0, "step": 1424 }, { "epoch": 1.777292576419214, "grad_norm": 0.11837827088623372, "learning_rate": 4.393460388504784e-06, "loss": 0.0355, "num_tokens": 108620246.0, "step": 1425 }, { "epoch": 1.778540237055521, "grad_norm": 0.11802718964613297, "learning_rate": 4.387587798850826e-06, "loss": 0.0358, "num_tokens": 108696646.0, "step": 1426 }, { "epoch": 1.7797878976918278, "grad_norm": 0.10473684840636525, "learning_rate": 4.381717226336426e-06, "loss": 0.0382, "num_tokens": 108773998.0, "step": 1427 }, { "epoch": 1.7810355583281348, "grad_norm": 0.1156664764311051, "learning_rate": 4.375848681606704e-06, "loss": 0.0349, "num_tokens": 108850108.0, "step": 1428 }, { "epoch": 1.7822832189644418, "grad_norm": 0.1186189185581627, "learning_rate": 4.369982175303104e-06, "loss": 0.0369, "num_tokens": 108927238.0, "step": 1429 }, { "epoch": 1.7835308796007485, "grad_norm": 0.12029945296887255, "learning_rate": 4.364117718063375e-06, "loss": 0.0362, "num_tokens": 109003852.0, "step": 1430 }, { "epoch": 1.7847785402370555, "grad_norm": 0.11467347487239687, "learning_rate": 4.358255320521553e-06, "loss": 0.0381, "num_tokens": 109080943.0, "step": 1431 }, { "epoch": 1.7860262008733625, "grad_norm": 0.11219671293656201, "learning_rate": 4.352394993307935e-06, "loss": 0.0353, "num_tokens": 109156308.0, "step": 1432 }, { "epoch": 1.7872738615096693, "grad_norm": 0.1195758432084249, "learning_rate": 4.346536747049068e-06, "loss": 0.0383, "num_tokens": 109233773.0, "step": 1433 }, { "epoch": 1.7885215221459763, "grad_norm": 0.12257041651086298, "learning_rate": 4.340680592367721e-06, "loss": 0.0367, "num_tokens": 109309017.0, "step": 1434 }, { "epoch": 1.7897691827822833, "grad_norm": 0.11644283699770981, "learning_rate": 4.33482653988287e-06, "loss": 0.0365, "num_tokens": 109386304.0, "step": 1435 }, { "epoch": 1.79101684341859, "grad_norm": 0.1159891995707703, "learning_rate": 4.328974600209687e-06, "loss": 0.0364, "num_tokens": 109462078.0, "step": 1436 }, { "epoch": 1.7922645040548972, "grad_norm": 0.11271455439524168, "learning_rate": 4.3231247839595045e-06, "loss": 0.0356, "num_tokens": 109538386.0, "step": 1437 }, { "epoch": 1.793512164691204, "grad_norm": 0.11081718708525366, "learning_rate": 4.317277101739806e-06, "loss": 0.0352, "num_tokens": 109614385.0, "step": 1438 }, { "epoch": 1.7947598253275108, "grad_norm": 0.1125273822950325, "learning_rate": 4.3114315641542105e-06, "loss": 0.0356, "num_tokens": 109690309.0, "step": 1439 }, { "epoch": 1.796007485963818, "grad_norm": 0.10944966687820053, "learning_rate": 4.305588181802441e-06, "loss": 0.0348, "num_tokens": 109766780.0, "step": 1440 }, { "epoch": 1.7972551466001248, "grad_norm": 0.11460545262390623, "learning_rate": 4.2997469652803185e-06, "loss": 0.0345, "num_tokens": 109843255.0, "step": 1441 }, { "epoch": 1.7985028072364317, "grad_norm": 0.11010943084688467, "learning_rate": 4.293907925179733e-06, "loss": 0.0365, "num_tokens": 109920554.0, "step": 1442 }, { "epoch": 1.7997504678727387, "grad_norm": 0.12295684254699428, "learning_rate": 4.28807107208863e-06, "loss": 0.0392, "num_tokens": 109997732.0, "step": 1443 }, { "epoch": 1.8009981285090455, "grad_norm": 0.13807569493569224, "learning_rate": 4.282236416590986e-06, "loss": 0.037, "num_tokens": 110074362.0, "step": 1444 }, { "epoch": 1.8022457891453525, "grad_norm": 0.11589812396647654, "learning_rate": 4.276403969266797e-06, "loss": 0.0334, "num_tokens": 110149499.0, "step": 1445 }, { "epoch": 1.8034934497816595, "grad_norm": 0.11874331032886756, "learning_rate": 4.270573740692053e-06, "loss": 0.0364, "num_tokens": 110225383.0, "step": 1446 }, { "epoch": 1.8047411104179663, "grad_norm": 0.10769893454787335, "learning_rate": 4.2647457414387205e-06, "loss": 0.0346, "num_tokens": 110300178.0, "step": 1447 }, { "epoch": 1.8059887710542732, "grad_norm": 0.10952676895213141, "learning_rate": 4.2589199820747226e-06, "loss": 0.0341, "num_tokens": 110378682.0, "step": 1448 }, { "epoch": 1.8072364316905802, "grad_norm": 0.11986450037494077, "learning_rate": 4.253096473163923e-06, "loss": 0.0357, "num_tokens": 110453993.0, "step": 1449 }, { "epoch": 1.808484092326887, "grad_norm": 0.10575750292359529, "learning_rate": 4.247275225266103e-06, "loss": 0.0368, "num_tokens": 110530495.0, "step": 1450 }, { "epoch": 1.809731752963194, "grad_norm": 0.1211718172176368, "learning_rate": 4.241456248936946e-06, "loss": 0.0374, "num_tokens": 110606906.0, "step": 1451 }, { "epoch": 1.810979413599501, "grad_norm": 0.11575168655236709, "learning_rate": 4.23563955472801e-06, "loss": 0.0357, "num_tokens": 110683315.0, "step": 1452 }, { "epoch": 1.8122270742358078, "grad_norm": 0.11205750328573585, "learning_rate": 4.229825153186727e-06, "loss": 0.0338, "num_tokens": 110758384.0, "step": 1453 }, { "epoch": 1.8134747348721147, "grad_norm": 0.1109122728392621, "learning_rate": 4.22401305485636e-06, "loss": 0.0362, "num_tokens": 110834195.0, "step": 1454 }, { "epoch": 1.8147223955084217, "grad_norm": 0.11438862268810039, "learning_rate": 4.218203270276e-06, "loss": 0.0355, "num_tokens": 110910964.0, "step": 1455 }, { "epoch": 1.8159700561447285, "grad_norm": 0.11552343707809745, "learning_rate": 4.2123958099805466e-06, "loss": 0.0364, "num_tokens": 110987489.0, "step": 1456 }, { "epoch": 1.8172177167810357, "grad_norm": 0.1128780835528746, "learning_rate": 4.206590684500675e-06, "loss": 0.0352, "num_tokens": 111063323.0, "step": 1457 }, { "epoch": 1.8184653774173425, "grad_norm": 0.11606306376773205, "learning_rate": 4.200787904362833e-06, "loss": 0.0365, "num_tokens": 111139548.0, "step": 1458 }, { "epoch": 1.8197130380536493, "grad_norm": 0.10291445670113526, "learning_rate": 4.194987480089218e-06, "loss": 0.0351, "num_tokens": 111216126.0, "step": 1459 }, { "epoch": 1.8209606986899565, "grad_norm": 0.11644649365731295, "learning_rate": 4.189189422197751e-06, "loss": 0.0348, "num_tokens": 111291904.0, "step": 1460 }, { "epoch": 1.8222083593262632, "grad_norm": 0.11302317074819644, "learning_rate": 4.183393741202065e-06, "loss": 0.0362, "num_tokens": 111367687.0, "step": 1461 }, { "epoch": 1.8234560199625702, "grad_norm": 0.11555775768411664, "learning_rate": 4.177600447611478e-06, "loss": 0.0358, "num_tokens": 111443817.0, "step": 1462 }, { "epoch": 1.8247036805988772, "grad_norm": 0.11640861821564545, "learning_rate": 4.171809551930985e-06, "loss": 0.0389, "num_tokens": 111521320.0, "step": 1463 }, { "epoch": 1.825951341235184, "grad_norm": 0.11354843479910538, "learning_rate": 4.166021064661231e-06, "loss": 0.038, "num_tokens": 111599182.0, "step": 1464 }, { "epoch": 1.827199001871491, "grad_norm": 0.11962115255796753, "learning_rate": 4.160234996298491e-06, "loss": 0.035, "num_tokens": 111675551.0, "step": 1465 }, { "epoch": 1.828446662507798, "grad_norm": 0.11860626927504106, "learning_rate": 4.154451357334654e-06, "loss": 0.0361, "num_tokens": 111754629.0, "step": 1466 }, { "epoch": 1.8296943231441047, "grad_norm": 0.1087452520721984, "learning_rate": 4.148670158257211e-06, "loss": 0.0359, "num_tokens": 111829688.0, "step": 1467 }, { "epoch": 1.8309419837804117, "grad_norm": 0.12262396185253335, "learning_rate": 4.142891409549219e-06, "loss": 0.036, "num_tokens": 111906321.0, "step": 1468 }, { "epoch": 1.8321896444167187, "grad_norm": 0.11301164282196123, "learning_rate": 4.137115121689297e-06, "loss": 0.035, "num_tokens": 111981952.0, "step": 1469 }, { "epoch": 1.8334373050530255, "grad_norm": 0.12102968567645797, "learning_rate": 4.131341305151603e-06, "loss": 0.0381, "num_tokens": 112058744.0, "step": 1470 }, { "epoch": 1.8346849656893325, "grad_norm": 0.11651089328857946, "learning_rate": 4.1255699704058085e-06, "loss": 0.0365, "num_tokens": 112135004.0, "step": 1471 }, { "epoch": 1.8359326263256395, "grad_norm": 0.12048112446389629, "learning_rate": 4.119801127917089e-06, "loss": 0.0358, "num_tokens": 112211639.0, "step": 1472 }, { "epoch": 1.8371802869619462, "grad_norm": 0.11324293941061209, "learning_rate": 4.114034788146101e-06, "loss": 0.0351, "num_tokens": 112287073.0, "step": 1473 }, { "epoch": 1.8384279475982532, "grad_norm": 0.11065510997356777, "learning_rate": 4.108270961548957e-06, "loss": 0.035, "num_tokens": 112362219.0, "step": 1474 }, { "epoch": 1.8396756082345602, "grad_norm": 0.1235587604718455, "learning_rate": 4.102509658577223e-06, "loss": 0.0358, "num_tokens": 112438186.0, "step": 1475 }, { "epoch": 1.840923268870867, "grad_norm": 0.10529808856557056, "learning_rate": 4.096750889677878e-06, "loss": 0.0332, "num_tokens": 112513405.0, "step": 1476 }, { "epoch": 1.8421709295071742, "grad_norm": 0.11603734080740498, "learning_rate": 4.090994665293313e-06, "loss": 0.0345, "num_tokens": 112588629.0, "step": 1477 }, { "epoch": 1.843418590143481, "grad_norm": 0.10787689083647872, "learning_rate": 4.085240995861301e-06, "loss": 0.0357, "num_tokens": 112664529.0, "step": 1478 }, { "epoch": 1.8446662507797877, "grad_norm": 0.11127451349079631, "learning_rate": 4.079489891814986e-06, "loss": 0.0355, "num_tokens": 112741228.0, "step": 1479 }, { "epoch": 1.845913911416095, "grad_norm": 0.13166792903179028, "learning_rate": 4.073741363582856e-06, "loss": 0.0371, "num_tokens": 112817011.0, "step": 1480 }, { "epoch": 1.8471615720524017, "grad_norm": 0.11657324844692983, "learning_rate": 4.06799542158873e-06, "loss": 0.0355, "num_tokens": 112892718.0, "step": 1481 }, { "epoch": 1.8484092326887087, "grad_norm": 0.10109047613936198, "learning_rate": 4.062252076251739e-06, "loss": 0.0349, "num_tokens": 112970058.0, "step": 1482 }, { "epoch": 1.8496568933250157, "grad_norm": 0.1095850182691673, "learning_rate": 4.056511337986304e-06, "loss": 0.0354, "num_tokens": 113046254.0, "step": 1483 }, { "epoch": 1.8509045539613225, "grad_norm": 0.1141555383648403, "learning_rate": 4.05077321720212e-06, "loss": 0.0377, "num_tokens": 113122580.0, "step": 1484 }, { "epoch": 1.8521522145976295, "grad_norm": 0.11965767255633106, "learning_rate": 4.045037724304129e-06, "loss": 0.0373, "num_tokens": 113199298.0, "step": 1485 }, { "epoch": 1.8533998752339365, "grad_norm": 0.12531984936041318, "learning_rate": 4.039304869692518e-06, "loss": 0.0374, "num_tokens": 113275703.0, "step": 1486 }, { "epoch": 1.8546475358702432, "grad_norm": 0.10134551113093508, "learning_rate": 4.033574663762685e-06, "loss": 0.0355, "num_tokens": 113351254.0, "step": 1487 }, { "epoch": 1.8558951965065502, "grad_norm": 0.11773826372613343, "learning_rate": 4.0278471169052224e-06, "loss": 0.0372, "num_tokens": 113427252.0, "step": 1488 }, { "epoch": 1.8571428571428572, "grad_norm": 0.11661724552812294, "learning_rate": 4.022122239505906e-06, "loss": 0.0358, "num_tokens": 113503323.0, "step": 1489 }, { "epoch": 1.858390517779164, "grad_norm": 0.11749716033346444, "learning_rate": 4.0164000419456715e-06, "loss": 0.0375, "num_tokens": 113580371.0, "step": 1490 }, { "epoch": 1.859638178415471, "grad_norm": 0.1305695968942797, "learning_rate": 4.010680534600587e-06, "loss": 0.0356, "num_tokens": 113656947.0, "step": 1491 }, { "epoch": 1.860885839051778, "grad_norm": 0.11359160111796697, "learning_rate": 4.004963727841852e-06, "loss": 0.0341, "num_tokens": 113732938.0, "step": 1492 }, { "epoch": 1.8621334996880847, "grad_norm": 0.10258761359426599, "learning_rate": 3.9992496320357645e-06, "loss": 0.0336, "num_tokens": 113809002.0, "step": 1493 }, { "epoch": 1.8633811603243917, "grad_norm": 0.11804430357862333, "learning_rate": 3.993538257543706e-06, "loss": 0.0328, "num_tokens": 113883796.0, "step": 1494 }, { "epoch": 1.8646288209606987, "grad_norm": 0.12483243738508598, "learning_rate": 3.987829614722124e-06, "loss": 0.0371, "num_tokens": 113960768.0, "step": 1495 }, { "epoch": 1.8658764815970055, "grad_norm": 0.1150151462214644, "learning_rate": 3.982123713922517e-06, "loss": 0.0358, "num_tokens": 114036180.0, "step": 1496 }, { "epoch": 1.8671241422333127, "grad_norm": 0.11869044286652397, "learning_rate": 3.976420565491404e-06, "loss": 0.0369, "num_tokens": 114111980.0, "step": 1497 }, { "epoch": 1.8683718028696195, "grad_norm": 0.11008881374766814, "learning_rate": 3.970720179770322e-06, "loss": 0.0346, "num_tokens": 114187724.0, "step": 1498 }, { "epoch": 1.8696194635059262, "grad_norm": 0.11080992842795516, "learning_rate": 3.965022567095788e-06, "loss": 0.0363, "num_tokens": 114263240.0, "step": 1499 }, { "epoch": 1.8708671241422334, "grad_norm": 0.1092058974188812, "learning_rate": 3.959327737799298e-06, "loss": 0.0353, "num_tokens": 114338209.0, "step": 1500 }, { "epoch": 1.8721147847785402, "grad_norm": 0.11386637728893095, "learning_rate": 3.953635702207299e-06, "loss": 0.0368, "num_tokens": 114415547.0, "step": 1501 }, { "epoch": 1.8733624454148472, "grad_norm": 0.1155459395703279, "learning_rate": 3.947946470641169e-06, "loss": 0.0366, "num_tokens": 114492678.0, "step": 1502 }, { "epoch": 1.8746101060511542, "grad_norm": 0.12225214682735673, "learning_rate": 3.9422600534172105e-06, "loss": 0.0373, "num_tokens": 114569035.0, "step": 1503 }, { "epoch": 1.875857766687461, "grad_norm": 0.11191387297145151, "learning_rate": 3.936576460846614e-06, "loss": 0.0345, "num_tokens": 114644776.0, "step": 1504 }, { "epoch": 1.877105427323768, "grad_norm": 0.1281184082430015, "learning_rate": 3.930895703235448e-06, "loss": 0.0358, "num_tokens": 114720908.0, "step": 1505 }, { "epoch": 1.878353087960075, "grad_norm": 0.11720996450584123, "learning_rate": 3.925217790884646e-06, "loss": 0.0334, "num_tokens": 114796173.0, "step": 1506 }, { "epoch": 1.8796007485963817, "grad_norm": 0.10440882158568637, "learning_rate": 3.919542734089978e-06, "loss": 0.0359, "num_tokens": 114872400.0, "step": 1507 }, { "epoch": 1.8808484092326887, "grad_norm": 0.11942205769199313, "learning_rate": 3.913870543142038e-06, "loss": 0.036, "num_tokens": 114948412.0, "step": 1508 }, { "epoch": 1.8820960698689957, "grad_norm": 0.11278966937999528, "learning_rate": 3.908201228326222e-06, "loss": 0.0371, "num_tokens": 115025101.0, "step": 1509 }, { "epoch": 1.8833437305053025, "grad_norm": 0.12097590188866718, "learning_rate": 3.902534799922713e-06, "loss": 0.034, "num_tokens": 115100026.0, "step": 1510 }, { "epoch": 1.8845913911416095, "grad_norm": 0.11125819036903851, "learning_rate": 3.896871268206456e-06, "loss": 0.0333, "num_tokens": 115175198.0, "step": 1511 }, { "epoch": 1.8858390517779164, "grad_norm": 0.13253062734929877, "learning_rate": 3.8912106434471486e-06, "loss": 0.0609, "num_tokens": 115251378.0, "step": 1512 }, { "epoch": 1.8870867124142232, "grad_norm": 0.12632121435115587, "learning_rate": 3.885552935909212e-06, "loss": 0.0362, "num_tokens": 115327638.0, "step": 1513 }, { "epoch": 1.8883343730505302, "grad_norm": 0.11211297136521399, "learning_rate": 3.879898155851779e-06, "loss": 0.0346, "num_tokens": 115404601.0, "step": 1514 }, { "epoch": 1.8895820336868372, "grad_norm": 0.10494844882789488, "learning_rate": 3.874246313528679e-06, "loss": 0.0341, "num_tokens": 115479999.0, "step": 1515 }, { "epoch": 1.890829694323144, "grad_norm": 0.10745141921792603, "learning_rate": 3.868597419188409e-06, "loss": 0.0352, "num_tokens": 115555643.0, "step": 1516 }, { "epoch": 1.8920773549594512, "grad_norm": 0.11732863353127684, "learning_rate": 3.862951483074119e-06, "loss": 0.0374, "num_tokens": 115632535.0, "step": 1517 }, { "epoch": 1.893325015595758, "grad_norm": 0.11654268076982084, "learning_rate": 3.857308515423601e-06, "loss": 0.0367, "num_tokens": 115710345.0, "step": 1518 }, { "epoch": 1.8945726762320647, "grad_norm": 0.11351548860109464, "learning_rate": 3.851668526469261e-06, "loss": 0.0351, "num_tokens": 115786951.0, "step": 1519 }, { "epoch": 1.895820336868372, "grad_norm": 0.11602516099241723, "learning_rate": 3.846031526438102e-06, "loss": 0.0398, "num_tokens": 115864384.0, "step": 1520 }, { "epoch": 1.8970679975046787, "grad_norm": 0.11306702153588803, "learning_rate": 3.84039752555171e-06, "loss": 0.0333, "num_tokens": 115940486.0, "step": 1521 }, { "epoch": 1.8983156581409857, "grad_norm": 0.1087232323419847, "learning_rate": 3.834766534026231e-06, "loss": 0.0342, "num_tokens": 116015978.0, "step": 1522 }, { "epoch": 1.8995633187772927, "grad_norm": 0.1108836772324257, "learning_rate": 3.829138562072353e-06, "loss": 0.0352, "num_tokens": 116091783.0, "step": 1523 }, { "epoch": 1.9008109794135994, "grad_norm": 0.12237650658630188, "learning_rate": 3.823513619895293e-06, "loss": 0.036, "num_tokens": 116167365.0, "step": 1524 }, { "epoch": 1.9020586400499064, "grad_norm": 0.10179348152240558, "learning_rate": 3.81789171769477e-06, "loss": 0.0349, "num_tokens": 116242859.0, "step": 1525 }, { "epoch": 1.9033063006862134, "grad_norm": 0.13829513509173896, "learning_rate": 3.812272865664994e-06, "loss": 0.0352, "num_tokens": 116319663.0, "step": 1526 }, { "epoch": 1.9045539613225202, "grad_norm": 0.1074739931014396, "learning_rate": 3.8066570739946394e-06, "loss": 0.0353, "num_tokens": 116396800.0, "step": 1527 }, { "epoch": 1.9058016219588272, "grad_norm": 0.1104254235716302, "learning_rate": 3.801044352866834e-06, "loss": 0.0338, "num_tokens": 116473030.0, "step": 1528 }, { "epoch": 1.9070492825951342, "grad_norm": 0.12040467420139594, "learning_rate": 3.7954347124591395e-06, "loss": 0.0381, "num_tokens": 116551011.0, "step": 1529 }, { "epoch": 1.908296943231441, "grad_norm": 0.12410608661446856, "learning_rate": 3.7898281629435286e-06, "loss": 0.0367, "num_tokens": 116627045.0, "step": 1530 }, { "epoch": 1.909544603867748, "grad_norm": 0.11448003184524622, "learning_rate": 3.7842247144863686e-06, "loss": 0.0329, "num_tokens": 116702568.0, "step": 1531 }, { "epoch": 1.910792264504055, "grad_norm": 0.1101418263894307, "learning_rate": 3.778624377248409e-06, "loss": 0.036, "num_tokens": 116778196.0, "step": 1532 }, { "epoch": 1.9120399251403617, "grad_norm": 0.1181310983123205, "learning_rate": 3.77302716138475e-06, "loss": 0.0351, "num_tokens": 116854139.0, "step": 1533 }, { "epoch": 1.913287585776669, "grad_norm": 0.12725788230803384, "learning_rate": 3.7674330770448374e-06, "loss": 0.0381, "num_tokens": 116930576.0, "step": 1534 }, { "epoch": 1.9145352464129757, "grad_norm": 0.1263442950413413, "learning_rate": 3.7618421343724386e-06, "loss": 0.0365, "num_tokens": 117010379.0, "step": 1535 }, { "epoch": 1.9157829070492824, "grad_norm": 0.11884255667209397, "learning_rate": 3.756254343505621e-06, "loss": 0.035, "num_tokens": 117086064.0, "step": 1536 }, { "epoch": 1.9170305676855897, "grad_norm": 0.11112128984019812, "learning_rate": 3.7506697145767367e-06, "loss": 0.0362, "num_tokens": 117162657.0, "step": 1537 }, { "epoch": 1.9182782283218964, "grad_norm": 0.11780736604644747, "learning_rate": 3.745088257712408e-06, "loss": 0.0373, "num_tokens": 117239941.0, "step": 1538 }, { "epoch": 1.9195258889582034, "grad_norm": 0.10960815283331832, "learning_rate": 3.7395099830335034e-06, "loss": 0.0341, "num_tokens": 117314955.0, "step": 1539 }, { "epoch": 1.9207735495945104, "grad_norm": 0.10987391122252545, "learning_rate": 3.7339349006551193e-06, "loss": 0.0362, "num_tokens": 117390856.0, "step": 1540 }, { "epoch": 1.9220212102308172, "grad_norm": 0.12286203958441236, "learning_rate": 3.7283630206865696e-06, "loss": 0.0348, "num_tokens": 117466317.0, "step": 1541 }, { "epoch": 1.9232688708671242, "grad_norm": 0.1136853088390901, "learning_rate": 3.7227943532313504e-06, "loss": 0.0371, "num_tokens": 117544324.0, "step": 1542 }, { "epoch": 1.9245165315034312, "grad_norm": 0.11982734473388944, "learning_rate": 3.7172289083871436e-06, "loss": 0.0362, "num_tokens": 117620317.0, "step": 1543 }, { "epoch": 1.925764192139738, "grad_norm": 0.12175325190701178, "learning_rate": 3.7116666962457813e-06, "loss": 0.0369, "num_tokens": 117696600.0, "step": 1544 }, { "epoch": 1.927011852776045, "grad_norm": 0.11092763932924518, "learning_rate": 3.7061077268932333e-06, "loss": 0.0351, "num_tokens": 117772431.0, "step": 1545 }, { "epoch": 1.928259513412352, "grad_norm": 0.11266769207847174, "learning_rate": 3.700552010409596e-06, "loss": 0.0377, "num_tokens": 117847727.0, "step": 1546 }, { "epoch": 1.9295071740486587, "grad_norm": 0.12131889331136142, "learning_rate": 3.694999556869059e-06, "loss": 0.0353, "num_tokens": 117923710.0, "step": 1547 }, { "epoch": 1.9307548346849657, "grad_norm": 0.10375682393956039, "learning_rate": 3.6894503763399003e-06, "loss": 0.0354, "num_tokens": 118000196.0, "step": 1548 }, { "epoch": 1.9320024953212727, "grad_norm": 0.12150821898167924, "learning_rate": 3.683904478884461e-06, "loss": 0.0368, "num_tokens": 118076950.0, "step": 1549 }, { "epoch": 1.9332501559575794, "grad_norm": 0.10745802148743575, "learning_rate": 3.67836187455913e-06, "loss": 0.0362, "num_tokens": 118152566.0, "step": 1550 }, { "epoch": 1.9344978165938864, "grad_norm": 0.11641734699356358, "learning_rate": 3.672822573414323e-06, "loss": 0.035, "num_tokens": 118228975.0, "step": 1551 }, { "epoch": 1.9357454772301934, "grad_norm": 0.113905776443254, "learning_rate": 3.6672865854944673e-06, "loss": 0.0361, "num_tokens": 118306295.0, "step": 1552 }, { "epoch": 1.9369931378665002, "grad_norm": 0.11805635474708707, "learning_rate": 3.6617539208379836e-06, "loss": 0.0344, "num_tokens": 118382040.0, "step": 1553 }, { "epoch": 1.9382407985028074, "grad_norm": 0.11668783333522152, "learning_rate": 3.656224589477264e-06, "loss": 0.0353, "num_tokens": 118457858.0, "step": 1554 }, { "epoch": 1.9394884591391142, "grad_norm": 0.11690683661248272, "learning_rate": 3.65069860143866e-06, "loss": 0.0351, "num_tokens": 118534064.0, "step": 1555 }, { "epoch": 1.940736119775421, "grad_norm": 0.12479882763354747, "learning_rate": 3.645175966742456e-06, "loss": 0.0361, "num_tokens": 118609978.0, "step": 1556 }, { "epoch": 1.9419837804117281, "grad_norm": 0.10499026791561165, "learning_rate": 3.639656695402858e-06, "loss": 0.035, "num_tokens": 118686812.0, "step": 1557 }, { "epoch": 1.943231441048035, "grad_norm": 0.11043958990536244, "learning_rate": 3.634140797427974e-06, "loss": 0.0355, "num_tokens": 118762829.0, "step": 1558 }, { "epoch": 1.944479101684342, "grad_norm": 0.12081921654678775, "learning_rate": 3.6286282828197904e-06, "loss": 0.0361, "num_tokens": 118838419.0, "step": 1559 }, { "epoch": 1.945726762320649, "grad_norm": 0.11487103811024217, "learning_rate": 3.623119161574169e-06, "loss": 0.0356, "num_tokens": 118914155.0, "step": 1560 }, { "epoch": 1.9469744229569557, "grad_norm": 0.11360686688440211, "learning_rate": 3.6176134436808074e-06, "loss": 0.0372, "num_tokens": 118990465.0, "step": 1561 }, { "epoch": 1.9482220835932627, "grad_norm": 0.11155062075729752, "learning_rate": 3.612111139123239e-06, "loss": 0.0332, "num_tokens": 119066805.0, "step": 1562 }, { "epoch": 1.9494697442295696, "grad_norm": 0.1127360747157812, "learning_rate": 3.6066122578788033e-06, "loss": 0.0354, "num_tokens": 119143624.0, "step": 1563 }, { "epoch": 1.9507174048658764, "grad_norm": 0.11141572186918379, "learning_rate": 3.6011168099186322e-06, "loss": 0.0361, "num_tokens": 119219639.0, "step": 1564 }, { "epoch": 1.9519650655021834, "grad_norm": 0.10663689560547111, "learning_rate": 3.5956248052076383e-06, "loss": 0.0346, "num_tokens": 119296081.0, "step": 1565 }, { "epoch": 1.9532127261384904, "grad_norm": 0.11391871771412312, "learning_rate": 3.5901362537044826e-06, "loss": 0.0349, "num_tokens": 119371763.0, "step": 1566 }, { "epoch": 1.9544603867747972, "grad_norm": 0.11909019651801186, "learning_rate": 3.584651165361568e-06, "loss": 0.0374, "num_tokens": 119448704.0, "step": 1567 }, { "epoch": 1.9557080474111042, "grad_norm": 0.10989187981100837, "learning_rate": 3.579169550125019e-06, "loss": 0.0373, "num_tokens": 119526959.0, "step": 1568 }, { "epoch": 1.9569557080474111, "grad_norm": 0.12551303966185134, "learning_rate": 3.5736914179346626e-06, "loss": 0.0377, "num_tokens": 119603721.0, "step": 1569 }, { "epoch": 1.958203368683718, "grad_norm": 0.11488802357223445, "learning_rate": 3.5682167787240053e-06, "loss": 0.0397, "num_tokens": 119682215.0, "step": 1570 }, { "epoch": 1.959451029320025, "grad_norm": 0.13645450743495965, "learning_rate": 3.5627456424202223e-06, "loss": 0.0348, "num_tokens": 119758988.0, "step": 1571 }, { "epoch": 1.960698689956332, "grad_norm": 0.11011490733351555, "learning_rate": 3.55727801894414e-06, "loss": 0.0338, "num_tokens": 119835006.0, "step": 1572 }, { "epoch": 1.9619463505926387, "grad_norm": 0.10610282995025075, "learning_rate": 3.5518139182102106e-06, "loss": 0.0351, "num_tokens": 119910802.0, "step": 1573 }, { "epoch": 1.9631940112289459, "grad_norm": 0.11487742308324417, "learning_rate": 3.5463533501265e-06, "loss": 0.0352, "num_tokens": 119986133.0, "step": 1574 }, { "epoch": 1.9644416718652526, "grad_norm": 0.11242988977931746, "learning_rate": 3.5408963245946714e-06, "loss": 0.0367, "num_tokens": 120062879.0, "step": 1575 }, { "epoch": 1.9656893325015594, "grad_norm": 0.12786058469254943, "learning_rate": 3.53544285150996e-06, "loss": 0.0342, "num_tokens": 120139678.0, "step": 1576 }, { "epoch": 1.9669369931378666, "grad_norm": 0.10314080876800046, "learning_rate": 3.529992940761159e-06, "loss": 0.0339, "num_tokens": 120215749.0, "step": 1577 }, { "epoch": 1.9681846537741734, "grad_norm": 0.10840591571518475, "learning_rate": 3.524546602230606e-06, "loss": 0.0368, "num_tokens": 120291566.0, "step": 1578 }, { "epoch": 1.9694323144104804, "grad_norm": 0.12653971547288437, "learning_rate": 3.5191038457941596e-06, "loss": 0.0354, "num_tokens": 120368332.0, "step": 1579 }, { "epoch": 1.9706799750467874, "grad_norm": 0.10154295626268736, "learning_rate": 3.5136646813211784e-06, "loss": 0.0326, "num_tokens": 120443551.0, "step": 1580 }, { "epoch": 1.9719276356830941, "grad_norm": 0.10391823409071556, "learning_rate": 3.5082291186745145e-06, "loss": 0.0359, "num_tokens": 120520401.0, "step": 1581 }, { "epoch": 1.9731752963194011, "grad_norm": 0.12143861281656622, "learning_rate": 3.5027971677104867e-06, "loss": 0.0364, "num_tokens": 120596577.0, "step": 1582 }, { "epoch": 1.9744229569557081, "grad_norm": 0.11710635362818253, "learning_rate": 3.497368838278862e-06, "loss": 0.0365, "num_tokens": 120672610.0, "step": 1583 }, { "epoch": 1.975670617592015, "grad_norm": 0.10505989387647319, "learning_rate": 3.491944140222845e-06, "loss": 0.0341, "num_tokens": 120748072.0, "step": 1584 }, { "epoch": 1.976918278228322, "grad_norm": 0.11325553204443413, "learning_rate": 3.486523083379051e-06, "loss": 0.035, "num_tokens": 120824607.0, "step": 1585 }, { "epoch": 1.9781659388646289, "grad_norm": 0.10616594120542275, "learning_rate": 3.481105677577493e-06, "loss": 0.0337, "num_tokens": 120899699.0, "step": 1586 }, { "epoch": 1.9794135995009356, "grad_norm": 0.11658220768049024, "learning_rate": 3.475691932641569e-06, "loss": 0.0356, "num_tokens": 120976512.0, "step": 1587 }, { "epoch": 1.9806612601372426, "grad_norm": 0.11335818137173843, "learning_rate": 3.4702818583880305e-06, "loss": 0.0331, "num_tokens": 121051822.0, "step": 1588 }, { "epoch": 1.9819089207735496, "grad_norm": 0.10858050015524784, "learning_rate": 3.46487546462698e-06, "loss": 0.0346, "num_tokens": 121127103.0, "step": 1589 }, { "epoch": 1.9831565814098564, "grad_norm": 0.12886983678658803, "learning_rate": 3.4594727611618462e-06, "loss": 0.0411, "num_tokens": 121204889.0, "step": 1590 }, { "epoch": 1.9844042420461634, "grad_norm": 0.1124176219267256, "learning_rate": 3.454073757789359e-06, "loss": 0.0365, "num_tokens": 121280596.0, "step": 1591 }, { "epoch": 1.9856519026824704, "grad_norm": 0.11711254550831095, "learning_rate": 3.4486784642995442e-06, "loss": 0.0355, "num_tokens": 121358235.0, "step": 1592 }, { "epoch": 1.9868995633187772, "grad_norm": 0.12398515386115859, "learning_rate": 3.4432868904757024e-06, "loss": 0.035, "num_tokens": 121434785.0, "step": 1593 }, { "epoch": 1.9881472239550844, "grad_norm": 0.11418604832158563, "learning_rate": 3.437899046094384e-06, "loss": 0.0348, "num_tokens": 121511577.0, "step": 1594 }, { "epoch": 1.9893948845913911, "grad_norm": 0.112301925129123, "learning_rate": 3.432514940925378e-06, "loss": 0.036, "num_tokens": 121588869.0, "step": 1595 }, { "epoch": 1.990642545227698, "grad_norm": 0.10984815919551645, "learning_rate": 3.4271345847316974e-06, "loss": 0.0378, "num_tokens": 121667192.0, "step": 1596 }, { "epoch": 1.9918902058640051, "grad_norm": 0.1126552804474371, "learning_rate": 3.421757987269554e-06, "loss": 0.0369, "num_tokens": 121743520.0, "step": 1597 }, { "epoch": 1.9931378665003119, "grad_norm": 0.11902040360206023, "learning_rate": 3.416385158288343e-06, "loss": 0.0358, "num_tokens": 121820019.0, "step": 1598 }, { "epoch": 1.9943855271366189, "grad_norm": 0.11822457506008535, "learning_rate": 3.411016107530628e-06, "loss": 0.0338, "num_tokens": 121896092.0, "step": 1599 }, { "epoch": 1.9956331877729259, "grad_norm": 0.11654657768961584, "learning_rate": 3.405650844732122e-06, "loss": 0.036, "num_tokens": 121973290.0, "step": 1600 }, { "epoch": 1.9968808484092326, "grad_norm": 0.1073126712837225, "learning_rate": 3.400289379621664e-06, "loss": 0.0353, "num_tokens": 122048846.0, "step": 1601 }, { "epoch": 1.9981285090455396, "grad_norm": 0.11588726397968585, "learning_rate": 3.394931721921214e-06, "loss": 0.036, "num_tokens": 122124025.0, "step": 1602 }, { "epoch": 1.9993761696818466, "grad_norm": 0.11056330436185885, "learning_rate": 3.3895778813458256e-06, "loss": 0.0354, "num_tokens": 122200456.0, "step": 1603 }, { "epoch": 2.0, "grad_norm": 0.16899944282914875, "learning_rate": 3.3842278676036293e-06, "loss": 0.0342, "num_tokens": 122238397.0, "step": 1604 }, { "epoch": 2.0012476606363068, "grad_norm": 0.09094457519669298, "learning_rate": 3.3788816903958145e-06, "loss": 0.0293, "num_tokens": 122313704.0, "step": 1605 }, { "epoch": 2.002495321272614, "grad_norm": 0.10470187666679959, "learning_rate": 3.37353935941662e-06, "loss": 0.031, "num_tokens": 122389665.0, "step": 1606 }, { "epoch": 2.0037429819089208, "grad_norm": 0.10261607757231991, "learning_rate": 3.3682008843533055e-06, "loss": 0.0306, "num_tokens": 122465436.0, "step": 1607 }, { "epoch": 2.0049906425452275, "grad_norm": 0.10768005430067183, "learning_rate": 3.3628662748861374e-06, "loss": 0.0301, "num_tokens": 122542091.0, "step": 1608 }, { "epoch": 2.0062383031815347, "grad_norm": 0.11091901445964379, "learning_rate": 3.357535540688379e-06, "loss": 0.0294, "num_tokens": 122617623.0, "step": 1609 }, { "epoch": 2.0074859638178415, "grad_norm": 0.10028034432839329, "learning_rate": 3.3522086914262585e-06, "loss": 0.031, "num_tokens": 122694522.0, "step": 1610 }, { "epoch": 2.0087336244541483, "grad_norm": 0.10536087513490514, "learning_rate": 3.3468857367589665e-06, "loss": 0.0314, "num_tokens": 122771954.0, "step": 1611 }, { "epoch": 2.0099812850904555, "grad_norm": 0.10430826658048525, "learning_rate": 3.3415666863386298e-06, "loss": 0.0297, "num_tokens": 122848328.0, "step": 1612 }, { "epoch": 2.0112289457267623, "grad_norm": 0.10453848093823005, "learning_rate": 3.3362515498102934e-06, "loss": 0.0303, "num_tokens": 122924877.0, "step": 1613 }, { "epoch": 2.0124766063630695, "grad_norm": 0.10028839319505682, "learning_rate": 3.330940336811903e-06, "loss": 0.029, "num_tokens": 123000873.0, "step": 1614 }, { "epoch": 2.0137242669993762, "grad_norm": 0.11743564991525901, "learning_rate": 3.325633056974298e-06, "loss": 0.0295, "num_tokens": 123076071.0, "step": 1615 }, { "epoch": 2.014971927635683, "grad_norm": 0.11146806831799687, "learning_rate": 3.3203297199211794e-06, "loss": 0.0289, "num_tokens": 123151982.0, "step": 1616 }, { "epoch": 2.01621958827199, "grad_norm": 0.12382288674623765, "learning_rate": 3.315030335269096e-06, "loss": 0.0293, "num_tokens": 123228726.0, "step": 1617 }, { "epoch": 2.017467248908297, "grad_norm": 0.11950129176714563, "learning_rate": 3.309734912627441e-06, "loss": 0.0277, "num_tokens": 123304605.0, "step": 1618 }, { "epoch": 2.0187149095446038, "grad_norm": 0.13106824723008773, "learning_rate": 3.304443461598413e-06, "loss": 0.0295, "num_tokens": 123382565.0, "step": 1619 }, { "epoch": 2.019962570180911, "grad_norm": 0.1441982452291627, "learning_rate": 3.299155991777011e-06, "loss": 0.0307, "num_tokens": 123457876.0, "step": 1620 }, { "epoch": 2.0212102308172177, "grad_norm": 0.1325658173846908, "learning_rate": 3.2938725127510185e-06, "loss": 0.0292, "num_tokens": 123533788.0, "step": 1621 }, { "epoch": 2.0224578914535245, "grad_norm": 0.1297831271928566, "learning_rate": 3.2885930341009774e-06, "loss": 0.0293, "num_tokens": 123609961.0, "step": 1622 }, { "epoch": 2.0237055520898317, "grad_norm": 0.121658116064928, "learning_rate": 3.2833175654001787e-06, "loss": 0.0283, "num_tokens": 123687877.0, "step": 1623 }, { "epoch": 2.0249532127261385, "grad_norm": 0.1180258767424536, "learning_rate": 3.278046116214642e-06, "loss": 0.0283, "num_tokens": 123764590.0, "step": 1624 }, { "epoch": 2.0262008733624453, "grad_norm": 0.11629528263227036, "learning_rate": 3.272778696103099e-06, "loss": 0.0282, "num_tokens": 123841467.0, "step": 1625 }, { "epoch": 2.0274485339987525, "grad_norm": 0.11430394671374194, "learning_rate": 3.2675153146169736e-06, "loss": 0.0284, "num_tokens": 123917776.0, "step": 1626 }, { "epoch": 2.0286961946350592, "grad_norm": 0.11713709735142017, "learning_rate": 3.2622559813003684e-06, "loss": 0.0286, "num_tokens": 123993601.0, "step": 1627 }, { "epoch": 2.029943855271366, "grad_norm": 0.11804813522003368, "learning_rate": 3.2570007056900437e-06, "loss": 0.0287, "num_tokens": 124069540.0, "step": 1628 }, { "epoch": 2.031191515907673, "grad_norm": 0.13641435448063324, "learning_rate": 3.2517494973154008e-06, "loss": 0.0311, "num_tokens": 124146287.0, "step": 1629 }, { "epoch": 2.03243917654398, "grad_norm": 0.11999347396308309, "learning_rate": 3.2465023656984707e-06, "loss": 0.0284, "num_tokens": 124222142.0, "step": 1630 }, { "epoch": 2.0336868371802868, "grad_norm": 0.11183330241969212, "learning_rate": 3.2412593203538857e-06, "loss": 0.0293, "num_tokens": 124298900.0, "step": 1631 }, { "epoch": 2.034934497816594, "grad_norm": 0.11671051353373899, "learning_rate": 3.236020370788876e-06, "loss": 0.0302, "num_tokens": 124374841.0, "step": 1632 }, { "epoch": 2.0361821584529007, "grad_norm": 0.11468680865831452, "learning_rate": 3.230785526503236e-06, "loss": 0.0288, "num_tokens": 124451278.0, "step": 1633 }, { "epoch": 2.037429819089208, "grad_norm": 0.12259831247354039, "learning_rate": 3.225554796989325e-06, "loss": 0.0307, "num_tokens": 124528188.0, "step": 1634 }, { "epoch": 2.0386774797255147, "grad_norm": 0.126955831725854, "learning_rate": 3.2203281917320328e-06, "loss": 0.0309, "num_tokens": 124605124.0, "step": 1635 }, { "epoch": 2.0399251403618215, "grad_norm": 0.1266009278644561, "learning_rate": 3.2151057202087783e-06, "loss": 0.0297, "num_tokens": 124682843.0, "step": 1636 }, { "epoch": 2.0411728009981287, "grad_norm": 0.11629716217576579, "learning_rate": 3.209887391889479e-06, "loss": 0.0289, "num_tokens": 124761248.0, "step": 1637 }, { "epoch": 2.0424204616344355, "grad_norm": 0.11719507314291479, "learning_rate": 3.204673216236539e-06, "loss": 0.03, "num_tokens": 124836567.0, "step": 1638 }, { "epoch": 2.0436681222707422, "grad_norm": 0.13293749631217391, "learning_rate": 3.199463202704838e-06, "loss": 0.0301, "num_tokens": 124914356.0, "step": 1639 }, { "epoch": 2.0449157829070495, "grad_norm": 0.13159382652993765, "learning_rate": 3.194257360741706e-06, "loss": 0.0302, "num_tokens": 124992108.0, "step": 1640 }, { "epoch": 2.046163443543356, "grad_norm": 0.10907795483596473, "learning_rate": 3.189055699786906e-06, "loss": 0.0287, "num_tokens": 125068440.0, "step": 1641 }, { "epoch": 2.047411104179663, "grad_norm": 0.1323478428940742, "learning_rate": 3.1838582292726206e-06, "loss": 0.031, "num_tokens": 125145344.0, "step": 1642 }, { "epoch": 2.04865876481597, "grad_norm": 0.11288069563704346, "learning_rate": 3.1786649586234373e-06, "loss": 0.0282, "num_tokens": 125220797.0, "step": 1643 }, { "epoch": 2.049906425452277, "grad_norm": 0.1305477821978057, "learning_rate": 3.173475897256325e-06, "loss": 0.0297, "num_tokens": 125296316.0, "step": 1644 }, { "epoch": 2.0511540860885837, "grad_norm": 0.11482519476065899, "learning_rate": 3.1682910545806167e-06, "loss": 0.0286, "num_tokens": 125373059.0, "step": 1645 }, { "epoch": 2.052401746724891, "grad_norm": 0.11295210479276795, "learning_rate": 3.1631104399980053e-06, "loss": 0.0281, "num_tokens": 125449203.0, "step": 1646 }, { "epoch": 2.0536494073611977, "grad_norm": 0.12032389651169483, "learning_rate": 3.157934062902508e-06, "loss": 0.0298, "num_tokens": 125525599.0, "step": 1647 }, { "epoch": 2.0548970679975045, "grad_norm": 0.11902065154734578, "learning_rate": 3.1527619326804594e-06, "loss": 0.0286, "num_tokens": 125601496.0, "step": 1648 }, { "epoch": 2.0561447286338117, "grad_norm": 0.11410725158268568, "learning_rate": 3.147594058710498e-06, "loss": 0.0286, "num_tokens": 125677905.0, "step": 1649 }, { "epoch": 2.0573923892701185, "grad_norm": 0.11816837329563465, "learning_rate": 3.14243045036354e-06, "loss": 0.0299, "num_tokens": 125756319.0, "step": 1650 }, { "epoch": 2.0586400499064252, "grad_norm": 0.11995125205915291, "learning_rate": 3.1372711170027666e-06, "loss": 0.0285, "num_tokens": 125832029.0, "step": 1651 }, { "epoch": 2.0598877105427325, "grad_norm": 0.12010979336540205, "learning_rate": 3.13211606798361e-06, "loss": 0.0294, "num_tokens": 125908359.0, "step": 1652 }, { "epoch": 2.061135371179039, "grad_norm": 0.11991952735264713, "learning_rate": 3.1269653126537344e-06, "loss": 0.029, "num_tokens": 125985504.0, "step": 1653 }, { "epoch": 2.0623830318153464, "grad_norm": 0.12245491314302823, "learning_rate": 3.121818860353011e-06, "loss": 0.0285, "num_tokens": 126061654.0, "step": 1654 }, { "epoch": 2.063630692451653, "grad_norm": 0.11426788535740982, "learning_rate": 3.116676720413519e-06, "loss": 0.0285, "num_tokens": 126138177.0, "step": 1655 }, { "epoch": 2.06487835308796, "grad_norm": 0.12048808823445821, "learning_rate": 3.11153890215951e-06, "loss": 0.0294, "num_tokens": 126214439.0, "step": 1656 }, { "epoch": 2.066126013724267, "grad_norm": 0.11638863421566648, "learning_rate": 3.1064054149073984e-06, "loss": 0.0289, "num_tokens": 126289984.0, "step": 1657 }, { "epoch": 2.067373674360574, "grad_norm": 0.13366366869107843, "learning_rate": 3.1012762679657525e-06, "loss": 0.0297, "num_tokens": 126365459.0, "step": 1658 }, { "epoch": 2.0686213349968807, "grad_norm": 0.11864627470251912, "learning_rate": 3.0961514706352654e-06, "loss": 0.0297, "num_tokens": 126442037.0, "step": 1659 }, { "epoch": 2.069868995633188, "grad_norm": 0.1204172393930324, "learning_rate": 3.09103103220874e-06, "loss": 0.0285, "num_tokens": 126517872.0, "step": 1660 }, { "epoch": 2.0711166562694947, "grad_norm": 0.11684206507561638, "learning_rate": 3.085914961971082e-06, "loss": 0.0285, "num_tokens": 126593424.0, "step": 1661 }, { "epoch": 2.0723643169058015, "grad_norm": 0.11878319279046977, "learning_rate": 3.080803269199275e-06, "loss": 0.0291, "num_tokens": 126668395.0, "step": 1662 }, { "epoch": 2.0736119775421087, "grad_norm": 0.13003391776490136, "learning_rate": 3.0756959631623583e-06, "loss": 0.0295, "num_tokens": 126744950.0, "step": 1663 }, { "epoch": 2.0748596381784155, "grad_norm": 0.11180545254543592, "learning_rate": 3.0705930531214255e-06, "loss": 0.0298, "num_tokens": 126820804.0, "step": 1664 }, { "epoch": 2.0761072988147222, "grad_norm": 0.12429937470575604, "learning_rate": 3.065494548329594e-06, "loss": 0.0293, "num_tokens": 126898763.0, "step": 1665 }, { "epoch": 2.0773549594510294, "grad_norm": 0.11939233163914764, "learning_rate": 3.060400458031991e-06, "loss": 0.0299, "num_tokens": 126975067.0, "step": 1666 }, { "epoch": 2.078602620087336, "grad_norm": 0.11859731452377882, "learning_rate": 3.055310791465744e-06, "loss": 0.0293, "num_tokens": 127051917.0, "step": 1667 }, { "epoch": 2.079850280723643, "grad_norm": 0.10596465405559069, "learning_rate": 3.0502255578599594e-06, "loss": 0.0286, "num_tokens": 127126581.0, "step": 1668 }, { "epoch": 2.08109794135995, "grad_norm": 0.12147255599548454, "learning_rate": 3.0451447664357005e-06, "loss": 0.0305, "num_tokens": 127203055.0, "step": 1669 }, { "epoch": 2.082345601996257, "grad_norm": 0.11456004309224181, "learning_rate": 3.040068426405976e-06, "loss": 0.0289, "num_tokens": 127279082.0, "step": 1670 }, { "epoch": 2.083593262632564, "grad_norm": 0.1099209222662035, "learning_rate": 3.0349965469757283e-06, "loss": 0.0288, "num_tokens": 127354973.0, "step": 1671 }, { "epoch": 2.084840923268871, "grad_norm": 0.1092793466680077, "learning_rate": 3.0299291373418038e-06, "loss": 0.0294, "num_tokens": 127430677.0, "step": 1672 }, { "epoch": 2.0860885839051777, "grad_norm": 0.1140484049265834, "learning_rate": 3.024866206692953e-06, "loss": 0.029, "num_tokens": 127508449.0, "step": 1673 }, { "epoch": 2.087336244541485, "grad_norm": 0.11781551488387931, "learning_rate": 3.0198077642097945e-06, "loss": 0.0299, "num_tokens": 127584320.0, "step": 1674 }, { "epoch": 2.0885839051777917, "grad_norm": 0.11904168811025281, "learning_rate": 3.014753819064817e-06, "loss": 0.0293, "num_tokens": 127661488.0, "step": 1675 }, { "epoch": 2.0898315658140985, "grad_norm": 0.11711225032175877, "learning_rate": 3.009704380422348e-06, "loss": 0.0291, "num_tokens": 127737991.0, "step": 1676 }, { "epoch": 2.0910792264504057, "grad_norm": 0.10612396078457137, "learning_rate": 3.004659457438548e-06, "loss": 0.0282, "num_tokens": 127814066.0, "step": 1677 }, { "epoch": 2.0923268870867124, "grad_norm": 0.11501420629637818, "learning_rate": 2.999619059261387e-06, "loss": 0.0294, "num_tokens": 127890907.0, "step": 1678 }, { "epoch": 2.093574547723019, "grad_norm": 0.11619371324958405, "learning_rate": 2.9945831950306285e-06, "loss": 0.029, "num_tokens": 127966833.0, "step": 1679 }, { "epoch": 2.0948222083593264, "grad_norm": 0.1344217605260488, "learning_rate": 2.9895518738778196e-06, "loss": 0.0303, "num_tokens": 128043513.0, "step": 1680 }, { "epoch": 2.096069868995633, "grad_norm": 0.11509467874428474, "learning_rate": 2.984525104926262e-06, "loss": 0.0287, "num_tokens": 128120826.0, "step": 1681 }, { "epoch": 2.09731752963194, "grad_norm": 0.13048016716661157, "learning_rate": 2.97950289729101e-06, "loss": 0.0285, "num_tokens": 128197592.0, "step": 1682 }, { "epoch": 2.098565190268247, "grad_norm": 0.11403952953370168, "learning_rate": 2.974485260078846e-06, "loss": 0.0284, "num_tokens": 128273160.0, "step": 1683 }, { "epoch": 2.099812850904554, "grad_norm": 0.11301234594867011, "learning_rate": 2.9694722023882607e-06, "loss": 0.0288, "num_tokens": 128349131.0, "step": 1684 }, { "epoch": 2.1010605115408607, "grad_norm": 0.11308339383694495, "learning_rate": 2.9644637333094404e-06, "loss": 0.0286, "num_tokens": 128424743.0, "step": 1685 }, { "epoch": 2.102308172177168, "grad_norm": 0.13222972121674195, "learning_rate": 2.959459861924258e-06, "loss": 0.0321, "num_tokens": 128500534.0, "step": 1686 }, { "epoch": 2.1035558328134747, "grad_norm": 0.11118702677552449, "learning_rate": 2.954460597306242e-06, "loss": 0.028, "num_tokens": 128576822.0, "step": 1687 }, { "epoch": 2.1048034934497815, "grad_norm": 0.12542704234184968, "learning_rate": 2.9494659485205683e-06, "loss": 0.0295, "num_tokens": 128653520.0, "step": 1688 }, { "epoch": 2.1060511540860887, "grad_norm": 0.11687630320165059, "learning_rate": 2.9444759246240505e-06, "loss": 0.0291, "num_tokens": 128730266.0, "step": 1689 }, { "epoch": 2.1072988147223954, "grad_norm": 0.11704520647268402, "learning_rate": 2.939490534665107e-06, "loss": 0.0303, "num_tokens": 128806281.0, "step": 1690 }, { "epoch": 2.108546475358702, "grad_norm": 0.12007099225710577, "learning_rate": 2.934509787683755e-06, "loss": 0.0296, "num_tokens": 128882197.0, "step": 1691 }, { "epoch": 2.1097941359950094, "grad_norm": 0.12894656412831512, "learning_rate": 2.929533692711598e-06, "loss": 0.0288, "num_tokens": 128958255.0, "step": 1692 }, { "epoch": 2.111041796631316, "grad_norm": 0.13635018499385967, "learning_rate": 2.9245622587717982e-06, "loss": 0.0317, "num_tokens": 129034456.0, "step": 1693 }, { "epoch": 2.1122894572676234, "grad_norm": 0.1188141113279919, "learning_rate": 2.919595494879065e-06, "loss": 0.0288, "num_tokens": 129111270.0, "step": 1694 }, { "epoch": 2.11353711790393, "grad_norm": 0.1218554043702889, "learning_rate": 2.9146334100396474e-06, "loss": 0.0281, "num_tokens": 129187037.0, "step": 1695 }, { "epoch": 2.114784778540237, "grad_norm": 0.11375072712332324, "learning_rate": 2.9096760132513036e-06, "loss": 0.0292, "num_tokens": 129263591.0, "step": 1696 }, { "epoch": 2.116032439176544, "grad_norm": 0.1226238587716704, "learning_rate": 2.9047233135032927e-06, "loss": 0.0286, "num_tokens": 129339359.0, "step": 1697 }, { "epoch": 2.117280099812851, "grad_norm": 0.1144029375418943, "learning_rate": 2.8997753197763532e-06, "loss": 0.0303, "num_tokens": 129415566.0, "step": 1698 }, { "epoch": 2.1185277604491577, "grad_norm": 0.12621177470688724, "learning_rate": 2.894832041042699e-06, "loss": 0.0302, "num_tokens": 129492927.0, "step": 1699 }, { "epoch": 2.119775421085465, "grad_norm": 0.13278516666491638, "learning_rate": 2.8898934862659823e-06, "loss": 0.0463, "num_tokens": 129570623.0, "step": 1700 }, { "epoch": 2.1210230817217717, "grad_norm": 0.12206158358190072, "learning_rate": 2.8849596644013e-06, "loss": 0.0303, "num_tokens": 129646983.0, "step": 1701 }, { "epoch": 2.1222707423580784, "grad_norm": 0.15346776037825924, "learning_rate": 2.880030584395162e-06, "loss": 0.029, "num_tokens": 129723058.0, "step": 1702 }, { "epoch": 2.1235184029943857, "grad_norm": 0.13346485239158706, "learning_rate": 2.8751062551854775e-06, "loss": 0.0292, "num_tokens": 129799644.0, "step": 1703 }, { "epoch": 2.1247660636306924, "grad_norm": 0.12080827755014605, "learning_rate": 2.870186685701545e-06, "loss": 0.0296, "num_tokens": 129875807.0, "step": 1704 }, { "epoch": 2.126013724266999, "grad_norm": 0.11778339398723604, "learning_rate": 2.8652718848640337e-06, "loss": 0.0281, "num_tokens": 129952062.0, "step": 1705 }, { "epoch": 2.1272613849033064, "grad_norm": 0.12815407188577152, "learning_rate": 2.8603618615849603e-06, "loss": 0.0292, "num_tokens": 130029889.0, "step": 1706 }, { "epoch": 2.128509045539613, "grad_norm": 0.12170030398585802, "learning_rate": 2.8554566247676806e-06, "loss": 0.0304, "num_tokens": 130106278.0, "step": 1707 }, { "epoch": 2.12975670617592, "grad_norm": 0.13564222415311583, "learning_rate": 2.850556183306874e-06, "loss": 0.0298, "num_tokens": 130182742.0, "step": 1708 }, { "epoch": 2.131004366812227, "grad_norm": 0.12092186918308077, "learning_rate": 2.845660546088519e-06, "loss": 0.0292, "num_tokens": 130258018.0, "step": 1709 }, { "epoch": 2.132252027448534, "grad_norm": 0.12388745329777386, "learning_rate": 2.8407697219898865e-06, "loss": 0.0292, "num_tokens": 130334446.0, "step": 1710 }, { "epoch": 2.133499688084841, "grad_norm": 0.12187584450348393, "learning_rate": 2.8358837198795223e-06, "loss": 0.0292, "num_tokens": 130409895.0, "step": 1711 }, { "epoch": 2.134747348721148, "grad_norm": 0.11869077268783002, "learning_rate": 2.8310025486172223e-06, "loss": 0.0296, "num_tokens": 130486706.0, "step": 1712 }, { "epoch": 2.1359950093574547, "grad_norm": 0.10293036941377427, "learning_rate": 2.8261262170540242e-06, "loss": 0.028, "num_tokens": 130562325.0, "step": 1713 }, { "epoch": 2.137242669993762, "grad_norm": 0.1128646409689079, "learning_rate": 2.821254734032194e-06, "loss": 0.0292, "num_tokens": 130638649.0, "step": 1714 }, { "epoch": 2.1384903306300687, "grad_norm": 0.1206922280211655, "learning_rate": 2.8163881083852e-06, "loss": 0.0284, "num_tokens": 130715765.0, "step": 1715 }, { "epoch": 2.1397379912663754, "grad_norm": 0.1110611156235701, "learning_rate": 2.811526348937706e-06, "loss": 0.0293, "num_tokens": 130791789.0, "step": 1716 }, { "epoch": 2.1409856519026826, "grad_norm": 0.12851385903558105, "learning_rate": 2.806669464505552e-06, "loss": 0.0291, "num_tokens": 130868440.0, "step": 1717 }, { "epoch": 2.1422333125389894, "grad_norm": 0.10535176804674477, "learning_rate": 2.80181746389574e-06, "loss": 0.028, "num_tokens": 130945015.0, "step": 1718 }, { "epoch": 2.143480973175296, "grad_norm": 0.11822191143631985, "learning_rate": 2.7969703559064076e-06, "loss": 0.0289, "num_tokens": 131022857.0, "step": 1719 }, { "epoch": 2.1447286338116034, "grad_norm": 0.12991511557728588, "learning_rate": 2.792128149326833e-06, "loss": 0.0305, "num_tokens": 131099510.0, "step": 1720 }, { "epoch": 2.14597629444791, "grad_norm": 0.12718239799706485, "learning_rate": 2.7872908529373976e-06, "loss": 0.0301, "num_tokens": 131176096.0, "step": 1721 }, { "epoch": 2.147223955084217, "grad_norm": 0.11554797810203757, "learning_rate": 2.782458475509581e-06, "loss": 0.0302, "num_tokens": 131252380.0, "step": 1722 }, { "epoch": 2.148471615720524, "grad_norm": 0.12320833041672591, "learning_rate": 2.7776310258059447e-06, "loss": 0.03, "num_tokens": 131328378.0, "step": 1723 }, { "epoch": 2.149719276356831, "grad_norm": 0.11326383473860072, "learning_rate": 2.772808512580114e-06, "loss": 0.0288, "num_tokens": 131405405.0, "step": 1724 }, { "epoch": 2.1509669369931377, "grad_norm": 0.11951009210803026, "learning_rate": 2.767990944576763e-06, "loss": 0.0286, "num_tokens": 131481573.0, "step": 1725 }, { "epoch": 2.152214597629445, "grad_norm": 0.11801895008095241, "learning_rate": 2.7631783305316017e-06, "loss": 0.029, "num_tokens": 131557531.0, "step": 1726 }, { "epoch": 2.1534622582657517, "grad_norm": 0.10989109836822063, "learning_rate": 2.7583706791713503e-06, "loss": 0.0287, "num_tokens": 131633233.0, "step": 1727 }, { "epoch": 2.154709918902059, "grad_norm": 0.11913978011568563, "learning_rate": 2.7535679992137338e-06, "loss": 0.0277, "num_tokens": 131709262.0, "step": 1728 }, { "epoch": 2.1559575795383656, "grad_norm": 0.12802965808075728, "learning_rate": 2.7487702993674647e-06, "loss": 0.0298, "num_tokens": 131785431.0, "step": 1729 }, { "epoch": 2.1572052401746724, "grad_norm": 0.11993178026046523, "learning_rate": 2.7439775883322228e-06, "loss": 0.0293, "num_tokens": 131861208.0, "step": 1730 }, { "epoch": 2.158452900810979, "grad_norm": 0.11896934214964502, "learning_rate": 2.739189874798639e-06, "loss": 0.0302, "num_tokens": 131937674.0, "step": 1731 }, { "epoch": 2.1597005614472864, "grad_norm": 0.13073507570615797, "learning_rate": 2.7344071674482874e-06, "loss": 0.029, "num_tokens": 132013613.0, "step": 1732 }, { "epoch": 2.160948222083593, "grad_norm": 0.12538080148618927, "learning_rate": 2.729629474953662e-06, "loss": 0.0299, "num_tokens": 132089741.0, "step": 1733 }, { "epoch": 2.1621958827199004, "grad_norm": 0.11470530230982613, "learning_rate": 2.7248568059781654e-06, "loss": 0.0283, "num_tokens": 132164908.0, "step": 1734 }, { "epoch": 2.163443543356207, "grad_norm": 0.12364100713980067, "learning_rate": 2.7200891691760838e-06, "loss": 0.0294, "num_tokens": 132242419.0, "step": 1735 }, { "epoch": 2.164691203992514, "grad_norm": 0.1124677553566527, "learning_rate": 2.715326573192588e-06, "loss": 0.0286, "num_tokens": 132318249.0, "step": 1736 }, { "epoch": 2.165938864628821, "grad_norm": 0.1297173190208866, "learning_rate": 2.710569026663702e-06, "loss": 0.0303, "num_tokens": 132394392.0, "step": 1737 }, { "epoch": 2.167186525265128, "grad_norm": 0.11636204301816074, "learning_rate": 2.705816538216296e-06, "loss": 0.0286, "num_tokens": 132471185.0, "step": 1738 }, { "epoch": 2.1684341859014347, "grad_norm": 0.11967082027832546, "learning_rate": 2.7010691164680696e-06, "loss": 0.0291, "num_tokens": 132547646.0, "step": 1739 }, { "epoch": 2.169681846537742, "grad_norm": 0.11365392254175426, "learning_rate": 2.696326770027533e-06, "loss": 0.0294, "num_tokens": 132622649.0, "step": 1740 }, { "epoch": 2.1709295071740486, "grad_norm": 0.13542950967226994, "learning_rate": 2.6915895074939912e-06, "loss": 0.0304, "num_tokens": 132699810.0, "step": 1741 }, { "epoch": 2.1721771678103554, "grad_norm": 0.1149886342950311, "learning_rate": 2.6868573374575356e-06, "loss": 0.0296, "num_tokens": 132775540.0, "step": 1742 }, { "epoch": 2.1734248284466626, "grad_norm": 0.13013765441316502, "learning_rate": 2.6821302684990204e-06, "loss": 0.0297, "num_tokens": 132851557.0, "step": 1743 }, { "epoch": 2.1746724890829694, "grad_norm": 0.12482263669044058, "learning_rate": 2.677408309190049e-06, "loss": 0.0292, "num_tokens": 132927251.0, "step": 1744 }, { "epoch": 2.175920149719276, "grad_norm": 0.12082906222739927, "learning_rate": 2.672691468092963e-06, "loss": 0.029, "num_tokens": 133004097.0, "step": 1745 }, { "epoch": 2.1771678103555834, "grad_norm": 0.11920653050743499, "learning_rate": 2.6679797537608184e-06, "loss": 0.0289, "num_tokens": 133079389.0, "step": 1746 }, { "epoch": 2.17841547099189, "grad_norm": 0.11613342645530199, "learning_rate": 2.6632731747373785e-06, "loss": 0.0288, "num_tokens": 133155198.0, "step": 1747 }, { "epoch": 2.179663131628197, "grad_norm": 0.123898197898323, "learning_rate": 2.658571739557096e-06, "loss": 0.0292, "num_tokens": 133231097.0, "step": 1748 }, { "epoch": 2.180910792264504, "grad_norm": 0.12283293422724019, "learning_rate": 2.653875456745092e-06, "loss": 0.0294, "num_tokens": 133308155.0, "step": 1749 }, { "epoch": 2.182158452900811, "grad_norm": 0.11096661481389018, "learning_rate": 2.6491843348171455e-06, "loss": 0.0285, "num_tokens": 133384157.0, "step": 1750 }, { "epoch": 2.183406113537118, "grad_norm": 0.1323212456611913, "learning_rate": 2.644498382279681e-06, "loss": 0.0317, "num_tokens": 133461262.0, "step": 1751 }, { "epoch": 2.184653774173425, "grad_norm": 0.11301431248801659, "learning_rate": 2.639817607629745e-06, "loss": 0.0293, "num_tokens": 133538603.0, "step": 1752 }, { "epoch": 2.1859014348097316, "grad_norm": 0.12487264376417616, "learning_rate": 2.635142019354995e-06, "loss": 0.0298, "num_tokens": 133614939.0, "step": 1753 }, { "epoch": 2.187149095446039, "grad_norm": 0.12391443914690654, "learning_rate": 2.6304716259336903e-06, "loss": 0.0301, "num_tokens": 133691509.0, "step": 1754 }, { "epoch": 2.1883967560823456, "grad_norm": 0.11436807139379579, "learning_rate": 2.6258064358346642e-06, "loss": 0.0293, "num_tokens": 133769447.0, "step": 1755 }, { "epoch": 2.1896444167186524, "grad_norm": 0.11605243292138384, "learning_rate": 2.621146457517314e-06, "loss": 0.0298, "num_tokens": 133845741.0, "step": 1756 }, { "epoch": 2.1908920773549596, "grad_norm": 0.12762818150692948, "learning_rate": 2.6164916994315916e-06, "loss": 0.0293, "num_tokens": 133922231.0, "step": 1757 }, { "epoch": 2.1921397379912664, "grad_norm": 0.10934869459664252, "learning_rate": 2.6118421700179795e-06, "loss": 0.0286, "num_tokens": 133997898.0, "step": 1758 }, { "epoch": 2.193387398627573, "grad_norm": 0.11556111476881359, "learning_rate": 2.6071978777074796e-06, "loss": 0.0285, "num_tokens": 134074139.0, "step": 1759 }, { "epoch": 2.1946350592638804, "grad_norm": 0.11684215494013445, "learning_rate": 2.6025588309215975e-06, "loss": 0.0288, "num_tokens": 134151443.0, "step": 1760 }, { "epoch": 2.195882719900187, "grad_norm": 0.10859546600803964, "learning_rate": 2.5979250380723287e-06, "loss": 0.0291, "num_tokens": 134228671.0, "step": 1761 }, { "epoch": 2.197130380536494, "grad_norm": 0.1184786660169172, "learning_rate": 2.5932965075621376e-06, "loss": 0.0295, "num_tokens": 134305328.0, "step": 1762 }, { "epoch": 2.198378041172801, "grad_norm": 0.11694769918714469, "learning_rate": 2.5886732477839514e-06, "loss": 0.0287, "num_tokens": 134381607.0, "step": 1763 }, { "epoch": 2.199625701809108, "grad_norm": 0.10869599540703445, "learning_rate": 2.584055267121137e-06, "loss": 0.0285, "num_tokens": 134457944.0, "step": 1764 }, { "epoch": 2.2008733624454146, "grad_norm": 0.11388521689730151, "learning_rate": 2.579442573947488e-06, "loss": 0.0292, "num_tokens": 134534177.0, "step": 1765 }, { "epoch": 2.202121023081722, "grad_norm": 0.12019491039099481, "learning_rate": 2.5748351766272127e-06, "loss": 0.0324, "num_tokens": 134610985.0, "step": 1766 }, { "epoch": 2.2033686837180286, "grad_norm": 0.12572287691368844, "learning_rate": 2.5702330835149137e-06, "loss": 0.0302, "num_tokens": 134686523.0, "step": 1767 }, { "epoch": 2.204616344354336, "grad_norm": 0.11387811266227701, "learning_rate": 2.5656363029555788e-06, "loss": 0.029, "num_tokens": 134762744.0, "step": 1768 }, { "epoch": 2.2058640049906426, "grad_norm": 0.13846833190116084, "learning_rate": 2.561044843284558e-06, "loss": 0.0296, "num_tokens": 134839137.0, "step": 1769 }, { "epoch": 2.2071116656269494, "grad_norm": 0.1555816479613367, "learning_rate": 2.556458712827558e-06, "loss": 0.0299, "num_tokens": 134914750.0, "step": 1770 }, { "epoch": 2.2083593262632566, "grad_norm": 0.11209797843880384, "learning_rate": 2.551877919900619e-06, "loss": 0.0289, "num_tokens": 134990474.0, "step": 1771 }, { "epoch": 2.2096069868995634, "grad_norm": 0.11661372301986613, "learning_rate": 2.5473024728101004e-06, "loss": 0.0294, "num_tokens": 135066743.0, "step": 1772 }, { "epoch": 2.21085464753587, "grad_norm": 0.12963541577443485, "learning_rate": 2.5427323798526747e-06, "loss": 0.0317, "num_tokens": 135142093.0, "step": 1773 }, { "epoch": 2.2121023081721773, "grad_norm": 0.11731994857913773, "learning_rate": 2.538167649315298e-06, "loss": 0.0281, "num_tokens": 135217549.0, "step": 1774 }, { "epoch": 2.213349968808484, "grad_norm": 0.11727497789837275, "learning_rate": 2.5336082894752084e-06, "loss": 0.028, "num_tokens": 135292208.0, "step": 1775 }, { "epoch": 2.214597629444791, "grad_norm": 0.10924884561562678, "learning_rate": 2.529054308599906e-06, "loss": 0.0285, "num_tokens": 135368860.0, "step": 1776 }, { "epoch": 2.215845290081098, "grad_norm": 0.1237241496137088, "learning_rate": 2.524505714947131e-06, "loss": 0.0302, "num_tokens": 135445704.0, "step": 1777 }, { "epoch": 2.217092950717405, "grad_norm": 0.14377566320772303, "learning_rate": 2.5199625167648576e-06, "loss": 0.0334, "num_tokens": 135521961.0, "step": 1778 }, { "epoch": 2.2183406113537116, "grad_norm": 0.11347892313850486, "learning_rate": 2.515424722291282e-06, "loss": 0.0283, "num_tokens": 135596262.0, "step": 1779 }, { "epoch": 2.219588271990019, "grad_norm": 0.12225004311131818, "learning_rate": 2.5108923397547934e-06, "loss": 0.0298, "num_tokens": 135673824.0, "step": 1780 }, { "epoch": 2.2208359326263256, "grad_norm": 0.11499506956311069, "learning_rate": 2.5063653773739705e-06, "loss": 0.0294, "num_tokens": 135749231.0, "step": 1781 }, { "epoch": 2.2220835932626324, "grad_norm": 0.1221359406411641, "learning_rate": 2.501843843357568e-06, "loss": 0.0291, "num_tokens": 135825518.0, "step": 1782 }, { "epoch": 2.2233312538989396, "grad_norm": 0.11102792400168941, "learning_rate": 2.4973277459044927e-06, "loss": 0.0288, "num_tokens": 135900872.0, "step": 1783 }, { "epoch": 2.2245789145352464, "grad_norm": 0.11593699845823963, "learning_rate": 2.4928170932037916e-06, "loss": 0.0288, "num_tokens": 135977380.0, "step": 1784 }, { "epoch": 2.225826575171553, "grad_norm": 0.1201879734680093, "learning_rate": 2.4883118934346446e-06, "loss": 0.0307, "num_tokens": 136052351.0, "step": 1785 }, { "epoch": 2.2270742358078603, "grad_norm": 0.11414556232554415, "learning_rate": 2.48381215476634e-06, "loss": 0.0287, "num_tokens": 136128645.0, "step": 1786 }, { "epoch": 2.228321896444167, "grad_norm": 0.11889689490126032, "learning_rate": 2.4793178853582624e-06, "loss": 0.0277, "num_tokens": 136207824.0, "step": 1787 }, { "epoch": 2.229569557080474, "grad_norm": 0.12178468802022104, "learning_rate": 2.474829093359881e-06, "loss": 0.0286, "num_tokens": 136283777.0, "step": 1788 }, { "epoch": 2.230817217716781, "grad_norm": 0.11219961060063727, "learning_rate": 2.4703457869107346e-06, "loss": 0.0284, "num_tokens": 136359801.0, "step": 1789 }, { "epoch": 2.232064878353088, "grad_norm": 0.11319967454171764, "learning_rate": 2.4658679741404106e-06, "loss": 0.0283, "num_tokens": 136436800.0, "step": 1790 }, { "epoch": 2.233312538989395, "grad_norm": 0.10577538482285624, "learning_rate": 2.461395663168539e-06, "loss": 0.0284, "num_tokens": 136512719.0, "step": 1791 }, { "epoch": 2.234560199625702, "grad_norm": 0.11201254599703794, "learning_rate": 2.4569288621047704e-06, "loss": 0.0276, "num_tokens": 136588384.0, "step": 1792 }, { "epoch": 2.2358078602620086, "grad_norm": 0.13162060846047863, "learning_rate": 2.452467579048764e-06, "loss": 0.0289, "num_tokens": 136665095.0, "step": 1793 }, { "epoch": 2.237055520898316, "grad_norm": 0.1139883066704449, "learning_rate": 2.4480118220901764e-06, "loss": 0.0284, "num_tokens": 136741352.0, "step": 1794 }, { "epoch": 2.2383031815346226, "grad_norm": 0.11192665390675736, "learning_rate": 2.4435615993086414e-06, "loss": 0.0284, "num_tokens": 136817373.0, "step": 1795 }, { "epoch": 2.2395508421709294, "grad_norm": 0.11005654494213833, "learning_rate": 2.4391169187737555e-06, "loss": 0.0294, "num_tokens": 136892152.0, "step": 1796 }, { "epoch": 2.2407985028072366, "grad_norm": 0.11372508392162389, "learning_rate": 2.434677788545071e-06, "loss": 0.03, "num_tokens": 136967795.0, "step": 1797 }, { "epoch": 2.2420461634435433, "grad_norm": 0.11176146479955043, "learning_rate": 2.4302442166720723e-06, "loss": 0.029, "num_tokens": 137043516.0, "step": 1798 }, { "epoch": 2.24329382407985, "grad_norm": 0.1313817503178842, "learning_rate": 2.4258162111941634e-06, "loss": 0.0323, "num_tokens": 137119810.0, "step": 1799 }, { "epoch": 2.2445414847161573, "grad_norm": 0.11693831324465238, "learning_rate": 2.42139378014066e-06, "loss": 0.0283, "num_tokens": 137195390.0, "step": 1800 }, { "epoch": 2.245789145352464, "grad_norm": 0.10613607438236099, "learning_rate": 2.416976931530764e-06, "loss": 0.0279, "num_tokens": 137270115.0, "step": 1801 }, { "epoch": 2.247036805988771, "grad_norm": 0.12911652294793682, "learning_rate": 2.4125656733735554e-06, "loss": 0.0299, "num_tokens": 137347278.0, "step": 1802 }, { "epoch": 2.248284466625078, "grad_norm": 0.1118436847971541, "learning_rate": 2.4081600136679805e-06, "loss": 0.0293, "num_tokens": 137422667.0, "step": 1803 }, { "epoch": 2.249532127261385, "grad_norm": 0.12017153749591643, "learning_rate": 2.403759960402834e-06, "loss": 0.0306, "num_tokens": 137499970.0, "step": 1804 }, { "epoch": 2.2507797878976916, "grad_norm": 0.11762476655558339, "learning_rate": 2.39936552155674e-06, "loss": 0.0285, "num_tokens": 137577581.0, "step": 1805 }, { "epoch": 2.252027448533999, "grad_norm": 0.1257011326248305, "learning_rate": 2.394976705098143e-06, "loss": 0.0293, "num_tokens": 137653132.0, "step": 1806 }, { "epoch": 2.2532751091703056, "grad_norm": 0.12517247918878088, "learning_rate": 2.3905935189852967e-06, "loss": 0.0301, "num_tokens": 137730362.0, "step": 1807 }, { "epoch": 2.254522769806613, "grad_norm": 0.11159522749229389, "learning_rate": 2.386215971166242e-06, "loss": 0.029, "num_tokens": 137806677.0, "step": 1808 }, { "epoch": 2.2557704304429196, "grad_norm": 0.1148546738045, "learning_rate": 2.381844069578793e-06, "loss": 0.0286, "num_tokens": 137882602.0, "step": 1809 }, { "epoch": 2.2570180910792264, "grad_norm": 0.11440912221708367, "learning_rate": 2.3774778221505316e-06, "loss": 0.0283, "num_tokens": 137958543.0, "step": 1810 }, { "epoch": 2.2582657517155336, "grad_norm": 0.1306981591104985, "learning_rate": 2.3731172367987856e-06, "loss": 0.046, "num_tokens": 138034225.0, "step": 1811 }, { "epoch": 2.2595134123518403, "grad_norm": 0.11196369587951115, "learning_rate": 2.3687623214306096e-06, "loss": 0.0289, "num_tokens": 138109276.0, "step": 1812 }, { "epoch": 2.260761072988147, "grad_norm": 0.11481976617907198, "learning_rate": 2.364413083942787e-06, "loss": 0.0286, "num_tokens": 138185059.0, "step": 1813 }, { "epoch": 2.2620087336244543, "grad_norm": 0.1257854519258705, "learning_rate": 2.3600695322217965e-06, "loss": 0.0302, "num_tokens": 138262086.0, "step": 1814 }, { "epoch": 2.263256394260761, "grad_norm": 0.12369190094856691, "learning_rate": 2.355731674143809e-06, "loss": 0.0301, "num_tokens": 138338212.0, "step": 1815 }, { "epoch": 2.264504054897068, "grad_norm": 0.12247698026358372, "learning_rate": 2.3513995175746757e-06, "loss": 0.0291, "num_tokens": 138414401.0, "step": 1816 }, { "epoch": 2.265751715533375, "grad_norm": 0.1271346776570538, "learning_rate": 2.3470730703699034e-06, "loss": 0.0297, "num_tokens": 138491333.0, "step": 1817 }, { "epoch": 2.266999376169682, "grad_norm": 0.11370340212148224, "learning_rate": 2.3427523403746496e-06, "loss": 0.0291, "num_tokens": 138567428.0, "step": 1818 }, { "epoch": 2.2682470368059886, "grad_norm": 0.11295824445642685, "learning_rate": 2.338437335423705e-06, "loss": 0.0288, "num_tokens": 138643908.0, "step": 1819 }, { "epoch": 2.269494697442296, "grad_norm": 0.11800330007227519, "learning_rate": 2.3341280633414763e-06, "loss": 0.0296, "num_tokens": 138718747.0, "step": 1820 }, { "epoch": 2.2707423580786026, "grad_norm": 0.11949774975707976, "learning_rate": 2.3298245319419755e-06, "loss": 0.0294, "num_tokens": 138795371.0, "step": 1821 }, { "epoch": 2.2719900187149094, "grad_norm": 0.11244072126302489, "learning_rate": 2.325526749028808e-06, "loss": 0.0287, "num_tokens": 138870711.0, "step": 1822 }, { "epoch": 2.2732376793512166, "grad_norm": 0.10880477678477739, "learning_rate": 2.321234722395152e-06, "loss": 0.0285, "num_tokens": 138946056.0, "step": 1823 }, { "epoch": 2.2744853399875233, "grad_norm": 0.12263235673529867, "learning_rate": 2.3169484598237484e-06, "loss": 0.0295, "num_tokens": 139022433.0, "step": 1824 }, { "epoch": 2.2757330006238305, "grad_norm": 0.11275334804297876, "learning_rate": 2.312667969086887e-06, "loss": 0.0291, "num_tokens": 139098812.0, "step": 1825 }, { "epoch": 2.2769806612601373, "grad_norm": 0.12780973779129554, "learning_rate": 2.308393257946393e-06, "loss": 0.0293, "num_tokens": 139174520.0, "step": 1826 }, { "epoch": 2.278228321896444, "grad_norm": 0.11873970508053847, "learning_rate": 2.304124334153608e-06, "loss": 0.0298, "num_tokens": 139251494.0, "step": 1827 }, { "epoch": 2.279475982532751, "grad_norm": 0.12524822945833192, "learning_rate": 2.2998612054493827e-06, "loss": 0.0293, "num_tokens": 139327208.0, "step": 1828 }, { "epoch": 2.280723643169058, "grad_norm": 0.12246702316926837, "learning_rate": 2.2956038795640573e-06, "loss": 0.0289, "num_tokens": 139402782.0, "step": 1829 }, { "epoch": 2.281971303805365, "grad_norm": 0.11636388498884984, "learning_rate": 2.291352364217449e-06, "loss": 0.0293, "num_tokens": 139479741.0, "step": 1830 }, { "epoch": 2.283218964441672, "grad_norm": 0.11228895431486227, "learning_rate": 2.287106667118841e-06, "loss": 0.0283, "num_tokens": 139554910.0, "step": 1831 }, { "epoch": 2.284466625077979, "grad_norm": 0.12324954885696872, "learning_rate": 2.2828667959669674e-06, "loss": 0.0298, "num_tokens": 139632084.0, "step": 1832 }, { "epoch": 2.2857142857142856, "grad_norm": 0.11630414434749232, "learning_rate": 2.2786327584499944e-06, "loss": 0.0286, "num_tokens": 139708348.0, "step": 1833 }, { "epoch": 2.286961946350593, "grad_norm": 0.11645485108839593, "learning_rate": 2.2744045622455112e-06, "loss": 0.0297, "num_tokens": 139785741.0, "step": 1834 }, { "epoch": 2.2882096069868996, "grad_norm": 0.12845198074631714, "learning_rate": 2.270182215020517e-06, "loss": 0.0296, "num_tokens": 139861955.0, "step": 1835 }, { "epoch": 2.2894572676232063, "grad_norm": 0.11029976660171407, "learning_rate": 2.2659657244314017e-06, "loss": 0.0285, "num_tokens": 139938627.0, "step": 1836 }, { "epoch": 2.2907049282595136, "grad_norm": 0.11686652041244248, "learning_rate": 2.26175509812394e-06, "loss": 0.0291, "num_tokens": 140016141.0, "step": 1837 }, { "epoch": 2.2919525888958203, "grad_norm": 0.1125996988495152, "learning_rate": 2.2575503437332677e-06, "loss": 0.0291, "num_tokens": 140091465.0, "step": 1838 }, { "epoch": 2.293200249532127, "grad_norm": 0.11916698980967634, "learning_rate": 2.2533514688838755e-06, "loss": 0.0295, "num_tokens": 140168133.0, "step": 1839 }, { "epoch": 2.2944479101684343, "grad_norm": 0.11855479850727757, "learning_rate": 2.2491584811895927e-06, "loss": 0.0306, "num_tokens": 140245782.0, "step": 1840 }, { "epoch": 2.295695570804741, "grad_norm": 0.1135002870486832, "learning_rate": 2.244971388253576e-06, "loss": 0.029, "num_tokens": 140321664.0, "step": 1841 }, { "epoch": 2.2969432314410483, "grad_norm": 0.12754892462978165, "learning_rate": 2.2407901976682884e-06, "loss": 0.0287, "num_tokens": 140398059.0, "step": 1842 }, { "epoch": 2.298190892077355, "grad_norm": 0.10743682818469019, "learning_rate": 2.2366149170154907e-06, "loss": 0.0289, "num_tokens": 140474424.0, "step": 1843 }, { "epoch": 2.299438552713662, "grad_norm": 0.1085712382920245, "learning_rate": 2.232445553866231e-06, "loss": 0.0282, "num_tokens": 140548741.0, "step": 1844 }, { "epoch": 2.3006862133499686, "grad_norm": 0.11188206978220724, "learning_rate": 2.228282115780824e-06, "loss": 0.0285, "num_tokens": 140626013.0, "step": 1845 }, { "epoch": 2.301933873986276, "grad_norm": 0.11573511063173106, "learning_rate": 2.22412461030884e-06, "loss": 0.0295, "num_tokens": 140702145.0, "step": 1846 }, { "epoch": 2.3031815346225826, "grad_norm": 0.11974119831273712, "learning_rate": 2.2199730449890964e-06, "loss": 0.029, "num_tokens": 140777410.0, "step": 1847 }, { "epoch": 2.30442919525889, "grad_norm": 0.12021299737556257, "learning_rate": 2.215827427349635e-06, "loss": 0.0292, "num_tokens": 140853428.0, "step": 1848 }, { "epoch": 2.3056768558951966, "grad_norm": 0.12185882384047894, "learning_rate": 2.211687764907711e-06, "loss": 0.029, "num_tokens": 140929302.0, "step": 1849 }, { "epoch": 2.3069245165315033, "grad_norm": 0.12520481675496664, "learning_rate": 2.2075540651697873e-06, "loss": 0.0299, "num_tokens": 141007578.0, "step": 1850 }, { "epoch": 2.3081721771678105, "grad_norm": 0.12383609326667641, "learning_rate": 2.2034263356315087e-06, "loss": 0.0294, "num_tokens": 141084153.0, "step": 1851 }, { "epoch": 2.3094198378041173, "grad_norm": 0.11399711978377276, "learning_rate": 2.1993045837776957e-06, "loss": 0.0276, "num_tokens": 141161746.0, "step": 1852 }, { "epoch": 2.310667498440424, "grad_norm": 0.12307731559682583, "learning_rate": 2.195188817082331e-06, "loss": 0.0292, "num_tokens": 141236998.0, "step": 1853 }, { "epoch": 2.3119151590767313, "grad_norm": 0.12325195165579966, "learning_rate": 2.1910790430085465e-06, "loss": 0.0285, "num_tokens": 141313798.0, "step": 1854 }, { "epoch": 2.313162819713038, "grad_norm": 0.12353402922323736, "learning_rate": 2.1869752690086e-06, "loss": 0.0296, "num_tokens": 141391359.0, "step": 1855 }, { "epoch": 2.314410480349345, "grad_norm": 0.11990914943000852, "learning_rate": 2.1828775025238787e-06, "loss": 0.0299, "num_tokens": 141469951.0, "step": 1856 }, { "epoch": 2.315658140985652, "grad_norm": 0.11789036943836603, "learning_rate": 2.1787857509848693e-06, "loss": 0.0294, "num_tokens": 141546155.0, "step": 1857 }, { "epoch": 2.316905801621959, "grad_norm": 0.11643916471269543, "learning_rate": 2.174700021811153e-06, "loss": 0.029, "num_tokens": 141622896.0, "step": 1858 }, { "epoch": 2.3181534622582656, "grad_norm": 0.11095738581724128, "learning_rate": 2.1706203224113944e-06, "loss": 0.0286, "num_tokens": 141699035.0, "step": 1859 }, { "epoch": 2.319401122894573, "grad_norm": 0.12453893497423184, "learning_rate": 2.1665466601833197e-06, "loss": 0.03, "num_tokens": 141776543.0, "step": 1860 }, { "epoch": 2.3206487835308796, "grad_norm": 0.11909510325046865, "learning_rate": 2.162479042513711e-06, "loss": 0.0295, "num_tokens": 141852198.0, "step": 1861 }, { "epoch": 2.3218964441671863, "grad_norm": 0.11584200227594135, "learning_rate": 2.158417476778388e-06, "loss": 0.029, "num_tokens": 141926871.0, "step": 1862 }, { "epoch": 2.3231441048034935, "grad_norm": 0.11689424136269533, "learning_rate": 2.1543619703421975e-06, "loss": 0.0293, "num_tokens": 142003627.0, "step": 1863 }, { "epoch": 2.3243917654398003, "grad_norm": 0.12009870206278882, "learning_rate": 2.1503125305589976e-06, "loss": 0.0292, "num_tokens": 142079455.0, "step": 1864 }, { "epoch": 2.3256394260761075, "grad_norm": 0.11985517420175247, "learning_rate": 2.146269164771648e-06, "loss": 0.0298, "num_tokens": 142156920.0, "step": 1865 }, { "epoch": 2.3268870867124143, "grad_norm": 0.10358166580215797, "learning_rate": 2.142231880311992e-06, "loss": 0.0275, "num_tokens": 142232961.0, "step": 1866 }, { "epoch": 2.328134747348721, "grad_norm": 0.11680882154313593, "learning_rate": 2.1382006845008456e-06, "loss": 0.0299, "num_tokens": 142308930.0, "step": 1867 }, { "epoch": 2.329382407985028, "grad_norm": 0.11287699434889473, "learning_rate": 2.1341755846479868e-06, "loss": 0.0288, "num_tokens": 142385073.0, "step": 1868 }, { "epoch": 2.330630068621335, "grad_norm": 0.11922671998430709, "learning_rate": 2.1301565880521387e-06, "loss": 0.0299, "num_tokens": 142461069.0, "step": 1869 }, { "epoch": 2.331877729257642, "grad_norm": 0.11411670675268805, "learning_rate": 2.1261437020009565e-06, "loss": 0.0288, "num_tokens": 142536513.0, "step": 1870 }, { "epoch": 2.333125389893949, "grad_norm": 0.11086492410716439, "learning_rate": 2.122136933771014e-06, "loss": 0.0275, "num_tokens": 142613035.0, "step": 1871 }, { "epoch": 2.334373050530256, "grad_norm": 0.11355073632493284, "learning_rate": 2.118136290627795e-06, "loss": 0.0281, "num_tokens": 142687991.0, "step": 1872 }, { "epoch": 2.3356207111665626, "grad_norm": 0.13046047958341747, "learning_rate": 2.114141779825674e-06, "loss": 0.03, "num_tokens": 142765175.0, "step": 1873 }, { "epoch": 2.3368683718028698, "grad_norm": 0.12467814349298817, "learning_rate": 2.110153408607904e-06, "loss": 0.0289, "num_tokens": 142840996.0, "step": 1874 }, { "epoch": 2.3381160324391765, "grad_norm": 0.11966459047710178, "learning_rate": 2.1061711842066124e-06, "loss": 0.0296, "num_tokens": 142916895.0, "step": 1875 }, { "epoch": 2.3393636930754833, "grad_norm": 0.11610628096770173, "learning_rate": 2.1021951138427736e-06, "loss": 0.0289, "num_tokens": 142992632.0, "step": 1876 }, { "epoch": 2.3406113537117905, "grad_norm": 0.10611006288527419, "learning_rate": 2.0982252047262025e-06, "loss": 0.0289, "num_tokens": 143068924.0, "step": 1877 }, { "epoch": 2.3418590143480973, "grad_norm": 0.11261369317367498, "learning_rate": 2.094261464055548e-06, "loss": 0.0284, "num_tokens": 143144300.0, "step": 1878 }, { "epoch": 2.343106674984404, "grad_norm": 0.11413669262158226, "learning_rate": 2.0903038990182684e-06, "loss": 0.0286, "num_tokens": 143220726.0, "step": 1879 }, { "epoch": 2.3443543356207113, "grad_norm": 0.12312252563671286, "learning_rate": 2.086352516790624e-06, "loss": 0.0292, "num_tokens": 143296504.0, "step": 1880 }, { "epoch": 2.345601996257018, "grad_norm": 0.12301300351069973, "learning_rate": 2.082407324537668e-06, "loss": 0.03, "num_tokens": 143373399.0, "step": 1881 }, { "epoch": 2.3468496568933253, "grad_norm": 0.11366967533685554, "learning_rate": 2.078468329413223e-06, "loss": 0.0284, "num_tokens": 143450300.0, "step": 1882 }, { "epoch": 2.348097317529632, "grad_norm": 0.12741351497399822, "learning_rate": 2.07453553855988e-06, "loss": 0.0298, "num_tokens": 143526458.0, "step": 1883 }, { "epoch": 2.349344978165939, "grad_norm": 0.1264248370411575, "learning_rate": 2.0706089591089785e-06, "loss": 0.0296, "num_tokens": 143602427.0, "step": 1884 }, { "epoch": 2.3505926388022456, "grad_norm": 0.11387889117213461, "learning_rate": 2.0666885981805916e-06, "loss": 0.0284, "num_tokens": 143679216.0, "step": 1885 }, { "epoch": 2.3518402994385528, "grad_norm": 0.1313321787677014, "learning_rate": 2.0627744628835196e-06, "loss": 0.0291, "num_tokens": 143755004.0, "step": 1886 }, { "epoch": 2.3530879600748595, "grad_norm": 0.11866227579559852, "learning_rate": 2.058866560315273e-06, "loss": 0.0284, "num_tokens": 143831363.0, "step": 1887 }, { "epoch": 2.3543356207111668, "grad_norm": 0.11693130482462902, "learning_rate": 2.054964897562061e-06, "loss": 0.0286, "num_tokens": 143906823.0, "step": 1888 }, { "epoch": 2.3555832813474735, "grad_norm": 0.12387166405220712, "learning_rate": 2.0510694816987724e-06, "loss": 0.0292, "num_tokens": 143982490.0, "step": 1889 }, { "epoch": 2.3568309419837803, "grad_norm": 0.11625853498164135, "learning_rate": 2.047180319788981e-06, "loss": 0.0293, "num_tokens": 144059126.0, "step": 1890 }, { "epoch": 2.3580786026200875, "grad_norm": 0.11795665021289878, "learning_rate": 2.0432974188849103e-06, "loss": 0.0294, "num_tokens": 144134816.0, "step": 1891 }, { "epoch": 2.3593262632563943, "grad_norm": 0.11653403992770552, "learning_rate": 2.0394207860274304e-06, "loss": 0.029, "num_tokens": 144210634.0, "step": 1892 }, { "epoch": 2.360573923892701, "grad_norm": 0.11661988331669815, "learning_rate": 2.035550428246053e-06, "loss": 0.0302, "num_tokens": 144286780.0, "step": 1893 }, { "epoch": 2.3618215845290083, "grad_norm": 0.10878502664004895, "learning_rate": 2.0316863525589037e-06, "loss": 0.028, "num_tokens": 144364570.0, "step": 1894 }, { "epoch": 2.363069245165315, "grad_norm": 0.11560765133919251, "learning_rate": 2.0278285659727187e-06, "loss": 0.0283, "num_tokens": 144438936.0, "step": 1895 }, { "epoch": 2.364316905801622, "grad_norm": 0.12027357703372889, "learning_rate": 2.023977075482833e-06, "loss": 0.0297, "num_tokens": 144515667.0, "step": 1896 }, { "epoch": 2.365564566437929, "grad_norm": 0.11720329585672999, "learning_rate": 2.0201318880731633e-06, "loss": 0.0289, "num_tokens": 144592784.0, "step": 1897 }, { "epoch": 2.3668122270742358, "grad_norm": 0.12262761391702148, "learning_rate": 2.0162930107161963e-06, "loss": 0.0287, "num_tokens": 144668333.0, "step": 1898 }, { "epoch": 2.3680598877105425, "grad_norm": 0.11209003719114731, "learning_rate": 2.012460450372976e-06, "loss": 0.0288, "num_tokens": 144744725.0, "step": 1899 }, { "epoch": 2.3693075483468498, "grad_norm": 0.12214495630165384, "learning_rate": 2.0086342139930932e-06, "loss": 0.0298, "num_tokens": 144821579.0, "step": 1900 }, { "epoch": 2.3705552089831565, "grad_norm": 0.12046060228472436, "learning_rate": 2.004814308514671e-06, "loss": 0.0287, "num_tokens": 144897323.0, "step": 1901 }, { "epoch": 2.3718028696194633, "grad_norm": 0.11069917239375722, "learning_rate": 2.001000740864353e-06, "loss": 0.0277, "num_tokens": 144972434.0, "step": 1902 }, { "epoch": 2.3730505302557705, "grad_norm": 0.11894361619187248, "learning_rate": 1.9971935179572893e-06, "loss": 0.0293, "num_tokens": 145048953.0, "step": 1903 }, { "epoch": 2.3742981908920773, "grad_norm": 0.11522118972823835, "learning_rate": 1.993392646697127e-06, "loss": 0.0287, "num_tokens": 145124171.0, "step": 1904 }, { "epoch": 2.3755458515283845, "grad_norm": 0.11993473456085262, "learning_rate": 1.9895981339759927e-06, "loss": 0.0298, "num_tokens": 145200923.0, "step": 1905 }, { "epoch": 2.3767935121646913, "grad_norm": 0.11659064385891797, "learning_rate": 1.985809986674487e-06, "loss": 0.0304, "num_tokens": 145277642.0, "step": 1906 }, { "epoch": 2.378041172800998, "grad_norm": 0.11233951997406662, "learning_rate": 1.982028211661665e-06, "loss": 0.0282, "num_tokens": 145354802.0, "step": 1907 }, { "epoch": 2.3792888334373052, "grad_norm": 0.12409125074153357, "learning_rate": 1.9782528157950266e-06, "loss": 0.0289, "num_tokens": 145430480.0, "step": 1908 }, { "epoch": 2.380536494073612, "grad_norm": 0.11627883738278946, "learning_rate": 1.974483805920508e-06, "loss": 0.0292, "num_tokens": 145507192.0, "step": 1909 }, { "epoch": 2.3817841547099188, "grad_norm": 0.12734639465992706, "learning_rate": 1.970721188872461e-06, "loss": 0.0302, "num_tokens": 145583500.0, "step": 1910 }, { "epoch": 2.383031815346226, "grad_norm": 0.11009739257647629, "learning_rate": 1.966964971473649e-06, "loss": 0.0282, "num_tokens": 145659305.0, "step": 1911 }, { "epoch": 2.3842794759825328, "grad_norm": 0.12085112973508644, "learning_rate": 1.9632151605352296e-06, "loss": 0.0288, "num_tokens": 145735646.0, "step": 1912 }, { "epoch": 2.3855271366188395, "grad_norm": 0.11074193233911775, "learning_rate": 1.9594717628567432e-06, "loss": 0.029, "num_tokens": 145810718.0, "step": 1913 }, { "epoch": 2.3867747972551467, "grad_norm": 0.12306001735547971, "learning_rate": 1.9557347852261007e-06, "loss": 0.0294, "num_tokens": 145886926.0, "step": 1914 }, { "epoch": 2.3880224578914535, "grad_norm": 0.11756178292941767, "learning_rate": 1.9520042344195727e-06, "loss": 0.0297, "num_tokens": 145963538.0, "step": 1915 }, { "epoch": 2.3892701185277603, "grad_norm": 0.12225103966132156, "learning_rate": 1.9482801172017758e-06, "loss": 0.0288, "num_tokens": 146039990.0, "step": 1916 }, { "epoch": 2.3905177791640675, "grad_norm": 0.11059553718727183, "learning_rate": 1.9445624403256576e-06, "loss": 0.0285, "num_tokens": 146116517.0, "step": 1917 }, { "epoch": 2.3917654398003743, "grad_norm": 0.12578612156742616, "learning_rate": 1.940851210532493e-06, "loss": 0.0298, "num_tokens": 146192937.0, "step": 1918 }, { "epoch": 2.393013100436681, "grad_norm": 0.11893920438902063, "learning_rate": 1.937146434551863e-06, "loss": 0.0283, "num_tokens": 146267536.0, "step": 1919 }, { "epoch": 2.3942607610729882, "grad_norm": 0.11149478865971485, "learning_rate": 1.933448119101644e-06, "loss": 0.0289, "num_tokens": 146344824.0, "step": 1920 }, { "epoch": 2.395508421709295, "grad_norm": 0.11882385786045513, "learning_rate": 1.929756270888003e-06, "loss": 0.0289, "num_tokens": 146421929.0, "step": 1921 }, { "epoch": 2.3967560823456022, "grad_norm": 0.10727470152443296, "learning_rate": 1.9260708966053744e-06, "loss": 0.0283, "num_tokens": 146497708.0, "step": 1922 }, { "epoch": 2.398003742981909, "grad_norm": 0.1094868205451024, "learning_rate": 1.9223920029364555e-06, "loss": 0.0283, "num_tokens": 146573739.0, "step": 1923 }, { "epoch": 2.3992514036182158, "grad_norm": 0.11691359954601226, "learning_rate": 1.9187195965521934e-06, "loss": 0.0293, "num_tokens": 146650698.0, "step": 1924 }, { "epoch": 2.4004990642545225, "grad_norm": 0.10662908498967849, "learning_rate": 1.9150536841117713e-06, "loss": 0.0286, "num_tokens": 146726679.0, "step": 1925 }, { "epoch": 2.4017467248908297, "grad_norm": 0.14014329804407294, "learning_rate": 1.911394272262595e-06, "loss": 0.0294, "num_tokens": 146804849.0, "step": 1926 }, { "epoch": 2.4029943855271365, "grad_norm": 0.12226614216324987, "learning_rate": 1.907741367640286e-06, "loss": 0.0299, "num_tokens": 146881226.0, "step": 1927 }, { "epoch": 2.4042420461634437, "grad_norm": 0.13585310316541752, "learning_rate": 1.9040949768686646e-06, "loss": 0.0305, "num_tokens": 146957259.0, "step": 1928 }, { "epoch": 2.4054897067997505, "grad_norm": 0.10302972876901556, "learning_rate": 1.900455106559737e-06, "loss": 0.0273, "num_tokens": 147031920.0, "step": 1929 }, { "epoch": 2.4067373674360573, "grad_norm": 0.1215937415585842, "learning_rate": 1.8968217633136909e-06, "loss": 0.0287, "num_tokens": 147109354.0, "step": 1930 }, { "epoch": 2.4079850280723645, "grad_norm": 0.11880054224376227, "learning_rate": 1.893194953718875e-06, "loss": 0.0287, "num_tokens": 147185080.0, "step": 1931 }, { "epoch": 2.4092326887086712, "grad_norm": 0.11355682893486546, "learning_rate": 1.8895746843517892e-06, "loss": 0.0291, "num_tokens": 147260147.0, "step": 1932 }, { "epoch": 2.410480349344978, "grad_norm": 0.1210669123668994, "learning_rate": 1.8859609617770786e-06, "loss": 0.0302, "num_tokens": 147336387.0, "step": 1933 }, { "epoch": 2.4117280099812852, "grad_norm": 0.11208579999965458, "learning_rate": 1.8823537925475143e-06, "loss": 0.0283, "num_tokens": 147411743.0, "step": 1934 }, { "epoch": 2.412975670617592, "grad_norm": 0.12438861535121437, "learning_rate": 1.8787531832039846e-06, "loss": 0.0294, "num_tokens": 147487099.0, "step": 1935 }, { "epoch": 2.4142233312538988, "grad_norm": 0.11521486060217191, "learning_rate": 1.8751591402754802e-06, "loss": 0.0295, "num_tokens": 147562576.0, "step": 1936 }, { "epoch": 2.415470991890206, "grad_norm": 0.12833118858246773, "learning_rate": 1.8715716702790903e-06, "loss": 0.029, "num_tokens": 147639867.0, "step": 1937 }, { "epoch": 2.4167186525265127, "grad_norm": 0.1232632765835214, "learning_rate": 1.8679907797199798e-06, "loss": 0.0289, "num_tokens": 147716321.0, "step": 1938 }, { "epoch": 2.41796631316282, "grad_norm": 0.12388893166951646, "learning_rate": 1.8644164750913868e-06, "loss": 0.029, "num_tokens": 147792320.0, "step": 1939 }, { "epoch": 2.4192139737991267, "grad_norm": 0.11673892717341924, "learning_rate": 1.8608487628746072e-06, "loss": 0.0288, "num_tokens": 147867833.0, "step": 1940 }, { "epoch": 2.4204616344354335, "grad_norm": 0.12016233695124004, "learning_rate": 1.8572876495389808e-06, "loss": 0.0289, "num_tokens": 147944585.0, "step": 1941 }, { "epoch": 2.4217092950717403, "grad_norm": 0.12686819493850526, "learning_rate": 1.8537331415418802e-06, "loss": 0.0294, "num_tokens": 148020225.0, "step": 1942 }, { "epoch": 2.4229569557080475, "grad_norm": 0.11875350996516942, "learning_rate": 1.8501852453287056e-06, "loss": 0.03, "num_tokens": 148096408.0, "step": 1943 }, { "epoch": 2.4242046163443542, "grad_norm": 0.12486101318734383, "learning_rate": 1.846643967332865e-06, "loss": 0.0284, "num_tokens": 148172670.0, "step": 1944 }, { "epoch": 2.4254522769806615, "grad_norm": 0.11059777297523263, "learning_rate": 1.8431093139757635e-06, "loss": 0.0277, "num_tokens": 148248603.0, "step": 1945 }, { "epoch": 2.4266999376169682, "grad_norm": 0.10968511209097837, "learning_rate": 1.8395812916667974e-06, "loss": 0.0275, "num_tokens": 148324109.0, "step": 1946 }, { "epoch": 2.427947598253275, "grad_norm": 0.12136214189843847, "learning_rate": 1.836059906803339e-06, "loss": 0.0288, "num_tokens": 148400040.0, "step": 1947 }, { "epoch": 2.429195258889582, "grad_norm": 0.1181035158906749, "learning_rate": 1.832545165770721e-06, "loss": 0.0281, "num_tokens": 148477225.0, "step": 1948 }, { "epoch": 2.430442919525889, "grad_norm": 0.1259344854755692, "learning_rate": 1.8290370749422327e-06, "loss": 0.0295, "num_tokens": 148554162.0, "step": 1949 }, { "epoch": 2.4316905801621957, "grad_norm": 0.11410632203785342, "learning_rate": 1.8255356406791036e-06, "loss": 0.0282, "num_tokens": 148629585.0, "step": 1950 }, { "epoch": 2.432938240798503, "grad_norm": 0.12856021094192358, "learning_rate": 1.82204086933049e-06, "loss": 0.0296, "num_tokens": 148706439.0, "step": 1951 }, { "epoch": 2.4341859014348097, "grad_norm": 0.11923685601983912, "learning_rate": 1.8185527672334712e-06, "loss": 0.0294, "num_tokens": 148781899.0, "step": 1952 }, { "epoch": 2.4354335620711165, "grad_norm": 0.12128121310656399, "learning_rate": 1.8150713407130283e-06, "loss": 0.0285, "num_tokens": 148857892.0, "step": 1953 }, { "epoch": 2.4366812227074237, "grad_norm": 0.11220293138970718, "learning_rate": 1.8115965960820414e-06, "loss": 0.0287, "num_tokens": 148933540.0, "step": 1954 }, { "epoch": 2.4379288833437305, "grad_norm": 0.12701814936674094, "learning_rate": 1.8081285396412738e-06, "loss": 0.0309, "num_tokens": 149009410.0, "step": 1955 }, { "epoch": 2.4391765439800372, "grad_norm": 0.12316860118736007, "learning_rate": 1.8046671776793584e-06, "loss": 0.0292, "num_tokens": 149085567.0, "step": 1956 }, { "epoch": 2.4404242046163445, "grad_norm": 0.12579245754407814, "learning_rate": 1.80121251647279e-06, "loss": 0.0293, "num_tokens": 149160818.0, "step": 1957 }, { "epoch": 2.4416718652526512, "grad_norm": 0.11864506441301292, "learning_rate": 1.7977645622859157e-06, "loss": 0.0295, "num_tokens": 149237095.0, "step": 1958 }, { "epoch": 2.442919525888958, "grad_norm": 0.12128068105962087, "learning_rate": 1.7943233213709173e-06, "loss": 0.0291, "num_tokens": 149313199.0, "step": 1959 }, { "epoch": 2.444167186525265, "grad_norm": 0.11867438858434401, "learning_rate": 1.7908887999678046e-06, "loss": 0.0286, "num_tokens": 149390816.0, "step": 1960 }, { "epoch": 2.445414847161572, "grad_norm": 0.10792932976220691, "learning_rate": 1.7874610043044027e-06, "loss": 0.0281, "num_tokens": 149466985.0, "step": 1961 }, { "epoch": 2.446662507797879, "grad_norm": 0.1561122611017201, "learning_rate": 1.7840399405963432e-06, "loss": 0.0346, "num_tokens": 149544590.0, "step": 1962 }, { "epoch": 2.447910168434186, "grad_norm": 0.12123511730152045, "learning_rate": 1.7806256150470472e-06, "loss": 0.0286, "num_tokens": 149619730.0, "step": 1963 }, { "epoch": 2.4491578290704927, "grad_norm": 0.10870931536325534, "learning_rate": 1.7772180338477173e-06, "loss": 0.0281, "num_tokens": 149694700.0, "step": 1964 }, { "epoch": 2.4504054897068, "grad_norm": 0.11432026693551328, "learning_rate": 1.7738172031773322e-06, "loss": 0.0288, "num_tokens": 149770875.0, "step": 1965 }, { "epoch": 2.4516531503431067, "grad_norm": 0.1237259018436631, "learning_rate": 1.7704231292026219e-06, "loss": 0.0286, "num_tokens": 149847328.0, "step": 1966 }, { "epoch": 2.4529008109794135, "grad_norm": 0.12251333691455961, "learning_rate": 1.76703581807807e-06, "loss": 0.0283, "num_tokens": 149923014.0, "step": 1967 }, { "epoch": 2.4541484716157207, "grad_norm": 0.13592170557138702, "learning_rate": 1.7636552759458963e-06, "loss": 0.0295, "num_tokens": 150000726.0, "step": 1968 }, { "epoch": 2.4553961322520275, "grad_norm": 0.10644631287643523, "learning_rate": 1.760281508936045e-06, "loss": 0.029, "num_tokens": 150076628.0, "step": 1969 }, { "epoch": 2.4566437928883342, "grad_norm": 0.11637380854434433, "learning_rate": 1.7569145231661738e-06, "loss": 0.029, "num_tokens": 150153366.0, "step": 1970 }, { "epoch": 2.4578914535246414, "grad_norm": 0.11721908963690945, "learning_rate": 1.753554324741648e-06, "loss": 0.0288, "num_tokens": 150229286.0, "step": 1971 }, { "epoch": 2.459139114160948, "grad_norm": 0.11861330158718068, "learning_rate": 1.7502009197555215e-06, "loss": 0.0283, "num_tokens": 150304975.0, "step": 1972 }, { "epoch": 2.460386774797255, "grad_norm": 0.11613107309040814, "learning_rate": 1.7468543142885308e-06, "loss": 0.0288, "num_tokens": 150380574.0, "step": 1973 }, { "epoch": 2.461634435433562, "grad_norm": 0.11791945836868803, "learning_rate": 1.7435145144090852e-06, "loss": 0.0297, "num_tokens": 150456868.0, "step": 1974 }, { "epoch": 2.462882096069869, "grad_norm": 0.10865521562257276, "learning_rate": 1.740181526173248e-06, "loss": 0.028, "num_tokens": 150532783.0, "step": 1975 }, { "epoch": 2.4641297567061757, "grad_norm": 0.12013505293295967, "learning_rate": 1.736855355624737e-06, "loss": 0.0289, "num_tokens": 150608207.0, "step": 1976 }, { "epoch": 2.465377417342483, "grad_norm": 0.1190130952026185, "learning_rate": 1.7335360087949048e-06, "loss": 0.0291, "num_tokens": 150683923.0, "step": 1977 }, { "epoch": 2.4666250779787897, "grad_norm": 0.12021618470381827, "learning_rate": 1.73022349170273e-06, "loss": 0.0295, "num_tokens": 150759319.0, "step": 1978 }, { "epoch": 2.467872738615097, "grad_norm": 0.1188294278109119, "learning_rate": 1.7269178103548057e-06, "loss": 0.0298, "num_tokens": 150836584.0, "step": 1979 }, { "epoch": 2.4691203992514037, "grad_norm": 0.11623486387948447, "learning_rate": 1.723618970745334e-06, "loss": 0.0293, "num_tokens": 150911549.0, "step": 1980 }, { "epoch": 2.4703680598877105, "grad_norm": 0.1176345828523494, "learning_rate": 1.7203269788561067e-06, "loss": 0.0291, "num_tokens": 150987402.0, "step": 1981 }, { "epoch": 2.4716157205240172, "grad_norm": 0.1170956124313601, "learning_rate": 1.7170418406564982e-06, "loss": 0.0288, "num_tokens": 151065490.0, "step": 1982 }, { "epoch": 2.4728633811603244, "grad_norm": 0.11242037500890412, "learning_rate": 1.7137635621034614e-06, "loss": 0.0283, "num_tokens": 151140755.0, "step": 1983 }, { "epoch": 2.474111041796631, "grad_norm": 0.12957828390475865, "learning_rate": 1.7104921491415038e-06, "loss": 0.0305, "num_tokens": 151219039.0, "step": 1984 }, { "epoch": 2.4753587024329384, "grad_norm": 0.1268773509148947, "learning_rate": 1.7072276077026856e-06, "loss": 0.0295, "num_tokens": 151296866.0, "step": 1985 }, { "epoch": 2.476606363069245, "grad_norm": 0.11827505548209889, "learning_rate": 1.7039699437066076e-06, "loss": 0.0289, "num_tokens": 151374496.0, "step": 1986 }, { "epoch": 2.477854023705552, "grad_norm": 0.12394239832221975, "learning_rate": 1.7007191630604003e-06, "loss": 0.0297, "num_tokens": 151449768.0, "step": 1987 }, { "epoch": 2.479101684341859, "grad_norm": 0.10729321452712806, "learning_rate": 1.6974752716587092e-06, "loss": 0.0281, "num_tokens": 151525242.0, "step": 1988 }, { "epoch": 2.480349344978166, "grad_norm": 0.11244564865318404, "learning_rate": 1.6942382753836912e-06, "loss": 0.0276, "num_tokens": 151600019.0, "step": 1989 }, { "epoch": 2.4815970056144727, "grad_norm": 0.11260476703513686, "learning_rate": 1.691008180105e-06, "loss": 0.0286, "num_tokens": 151676641.0, "step": 1990 }, { "epoch": 2.48284466625078, "grad_norm": 0.11508491747273103, "learning_rate": 1.6877849916797728e-06, "loss": 0.0287, "num_tokens": 151752758.0, "step": 1991 }, { "epoch": 2.4840923268870867, "grad_norm": 0.12295722659763934, "learning_rate": 1.684568715952626e-06, "loss": 0.0292, "num_tokens": 151828614.0, "step": 1992 }, { "epoch": 2.4853399875233935, "grad_norm": 0.11201936145104541, "learning_rate": 1.6813593587556392e-06, "loss": 0.0289, "num_tokens": 151904503.0, "step": 1993 }, { "epoch": 2.4865876481597007, "grad_norm": 0.11197536265010408, "learning_rate": 1.6781569259083463e-06, "loss": 0.0281, "num_tokens": 151980779.0, "step": 1994 }, { "epoch": 2.4878353087960074, "grad_norm": 0.127443601047076, "learning_rate": 1.6749614232177273e-06, "loss": 0.0294, "num_tokens": 152057218.0, "step": 1995 }, { "epoch": 2.489082969432314, "grad_norm": 0.1272175699987119, "learning_rate": 1.6717728564781927e-06, "loss": 0.0297, "num_tokens": 152134969.0, "step": 1996 }, { "epoch": 2.4903306300686214, "grad_norm": 0.11830686372514893, "learning_rate": 1.6685912314715797e-06, "loss": 0.0286, "num_tokens": 152209799.0, "step": 1997 }, { "epoch": 2.491578290704928, "grad_norm": 0.11785531730640933, "learning_rate": 1.6654165539671342e-06, "loss": 0.0292, "num_tokens": 152286463.0, "step": 1998 }, { "epoch": 2.492825951341235, "grad_norm": 0.11630082166589754, "learning_rate": 1.6622488297215079e-06, "loss": 0.0281, "num_tokens": 152362651.0, "step": 1999 }, { "epoch": 2.494073611977542, "grad_norm": 0.1147252131365444, "learning_rate": 1.6590880644787407e-06, "loss": 0.0274, "num_tokens": 152439284.0, "step": 2000 }, { "epoch": 2.495321272613849, "grad_norm": 0.10890875650452751, "learning_rate": 1.6559342639702563e-06, "loss": 0.0285, "num_tokens": 152516175.0, "step": 2001 }, { "epoch": 2.496568933250156, "grad_norm": 0.11369281469621417, "learning_rate": 1.6527874339148484e-06, "loss": 0.0281, "num_tokens": 152591844.0, "step": 2002 }, { "epoch": 2.497816593886463, "grad_norm": 0.11765443898571652, "learning_rate": 1.6496475800186702e-06, "loss": 0.0292, "num_tokens": 152669310.0, "step": 2003 }, { "epoch": 2.4990642545227697, "grad_norm": 0.12392954052131615, "learning_rate": 1.6465147079752264e-06, "loss": 0.0297, "num_tokens": 152745302.0, "step": 2004 }, { "epoch": 2.5003119151590765, "grad_norm": 0.10811929633453872, "learning_rate": 1.6433888234653614e-06, "loss": 0.0279, "num_tokens": 152821134.0, "step": 2005 }, { "epoch": 2.5015595757953837, "grad_norm": 0.12800440714521488, "learning_rate": 1.6402699321572485e-06, "loss": 0.0289, "num_tokens": 152897262.0, "step": 2006 }, { "epoch": 2.5028072364316905, "grad_norm": 0.11428461694118386, "learning_rate": 1.6371580397063788e-06, "loss": 0.0282, "num_tokens": 152973605.0, "step": 2007 }, { "epoch": 2.5040548970679977, "grad_norm": 0.10820362542898414, "learning_rate": 1.6340531517555563e-06, "loss": 0.0283, "num_tokens": 153049729.0, "step": 2008 }, { "epoch": 2.5053025577043044, "grad_norm": 0.12644172316361868, "learning_rate": 1.6309552739348804e-06, "loss": 0.0293, "num_tokens": 153127475.0, "step": 2009 }, { "epoch": 2.506550218340611, "grad_norm": 0.10836425357174682, "learning_rate": 1.6278644118617375e-06, "loss": 0.0285, "num_tokens": 153204165.0, "step": 2010 }, { "epoch": 2.5077978789769184, "grad_norm": 0.12135651297840683, "learning_rate": 1.6247805711407993e-06, "loss": 0.0295, "num_tokens": 153280453.0, "step": 2011 }, { "epoch": 2.509045539613225, "grad_norm": 0.11582565410724034, "learning_rate": 1.6217037573639983e-06, "loss": 0.029, "num_tokens": 153357256.0, "step": 2012 }, { "epoch": 2.5102932002495324, "grad_norm": 0.11923075497477043, "learning_rate": 1.6186339761105275e-06, "loss": 0.0295, "num_tokens": 153434260.0, "step": 2013 }, { "epoch": 2.511540860885839, "grad_norm": 0.11545869189350899, "learning_rate": 1.6155712329468305e-06, "loss": 0.0281, "num_tokens": 153512382.0, "step": 2014 }, { "epoch": 2.512788521522146, "grad_norm": 0.1106551868324314, "learning_rate": 1.6125155334265846e-06, "loss": 0.0292, "num_tokens": 153587250.0, "step": 2015 }, { "epoch": 2.5140361821584527, "grad_norm": 0.11147702747990101, "learning_rate": 1.6094668830906959e-06, "loss": 0.0285, "num_tokens": 153661781.0, "step": 2016 }, { "epoch": 2.51528384279476, "grad_norm": 0.13288602885472603, "learning_rate": 1.6064252874672904e-06, "loss": 0.031, "num_tokens": 153737392.0, "step": 2017 }, { "epoch": 2.5165315034310667, "grad_norm": 0.11513263509992315, "learning_rate": 1.6033907520717008e-06, "loss": 0.0284, "num_tokens": 153813161.0, "step": 2018 }, { "epoch": 2.517779164067374, "grad_norm": 0.1144863171593377, "learning_rate": 1.6003632824064553e-06, "loss": 0.0288, "num_tokens": 153889389.0, "step": 2019 }, { "epoch": 2.5190268247036807, "grad_norm": 0.1073831605303728, "learning_rate": 1.5973428839612727e-06, "loss": 0.0279, "num_tokens": 153966333.0, "step": 2020 }, { "epoch": 2.5202744853399874, "grad_norm": 0.1205683735174724, "learning_rate": 1.5943295622130483e-06, "loss": 0.0277, "num_tokens": 154042117.0, "step": 2021 }, { "epoch": 2.521522145976294, "grad_norm": 0.11709366889958371, "learning_rate": 1.5913233226258437e-06, "loss": 0.0278, "num_tokens": 154117674.0, "step": 2022 }, { "epoch": 2.5227698066126014, "grad_norm": 0.12142255989860215, "learning_rate": 1.5883241706508823e-06, "loss": 0.0296, "num_tokens": 154195162.0, "step": 2023 }, { "epoch": 2.524017467248908, "grad_norm": 0.11351365164635996, "learning_rate": 1.5853321117265317e-06, "loss": 0.0283, "num_tokens": 154271186.0, "step": 2024 }, { "epoch": 2.5252651278852154, "grad_norm": 0.12443820690995414, "learning_rate": 1.5823471512782983e-06, "loss": 0.0291, "num_tokens": 154347579.0, "step": 2025 }, { "epoch": 2.526512788521522, "grad_norm": 0.12077454617168958, "learning_rate": 1.579369294718819e-06, "loss": 0.0279, "num_tokens": 154422999.0, "step": 2026 }, { "epoch": 2.527760449157829, "grad_norm": 0.11841856753505717, "learning_rate": 1.5763985474478483e-06, "loss": 0.028, "num_tokens": 154498846.0, "step": 2027 }, { "epoch": 2.529008109794136, "grad_norm": 0.1191743469311657, "learning_rate": 1.5734349148522471e-06, "loss": 0.0292, "num_tokens": 154576045.0, "step": 2028 }, { "epoch": 2.530255770430443, "grad_norm": 0.12040356577829175, "learning_rate": 1.5704784023059788e-06, "loss": 0.0293, "num_tokens": 154652119.0, "step": 2029 }, { "epoch": 2.5315034310667497, "grad_norm": 0.1235315800743608, "learning_rate": 1.5675290151700937e-06, "loss": 0.0299, "num_tokens": 154728765.0, "step": 2030 }, { "epoch": 2.532751091703057, "grad_norm": 0.11007266293522812, "learning_rate": 1.5645867587927208e-06, "loss": 0.0276, "num_tokens": 154804766.0, "step": 2031 }, { "epoch": 2.5339987523393637, "grad_norm": 0.14515242423085778, "learning_rate": 1.561651638509062e-06, "loss": 0.039, "num_tokens": 154881886.0, "step": 2032 }, { "epoch": 2.5352464129756704, "grad_norm": 0.11030630179121877, "learning_rate": 1.5587236596413773e-06, "loss": 0.0288, "num_tokens": 154957220.0, "step": 2033 }, { "epoch": 2.5364940736119777, "grad_norm": 0.11515001091522285, "learning_rate": 1.5558028274989778e-06, "loss": 0.0282, "num_tokens": 155033151.0, "step": 2034 }, { "epoch": 2.5377417342482844, "grad_norm": 0.12846274937409335, "learning_rate": 1.5528891473782126e-06, "loss": 0.0305, "num_tokens": 155110509.0, "step": 2035 }, { "epoch": 2.5389893948845916, "grad_norm": 0.13067981208487092, "learning_rate": 1.5499826245624674e-06, "loss": 0.0302, "num_tokens": 155187066.0, "step": 2036 }, { "epoch": 2.5402370555208984, "grad_norm": 0.11907261818233296, "learning_rate": 1.547083264322145e-06, "loss": 0.0292, "num_tokens": 155263148.0, "step": 2037 }, { "epoch": 2.541484716157205, "grad_norm": 0.12299689014144845, "learning_rate": 1.5441910719146616e-06, "loss": 0.0288, "num_tokens": 155340431.0, "step": 2038 }, { "epoch": 2.542732376793512, "grad_norm": 0.13015131026580284, "learning_rate": 1.541306052584437e-06, "loss": 0.0303, "num_tokens": 155416415.0, "step": 2039 }, { "epoch": 2.543980037429819, "grad_norm": 0.11049380134309983, "learning_rate": 1.5384282115628834e-06, "loss": 0.0293, "num_tokens": 155494379.0, "step": 2040 }, { "epoch": 2.545227698066126, "grad_norm": 0.1162771794153917, "learning_rate": 1.5355575540683953e-06, "loss": 0.0291, "num_tokens": 155569548.0, "step": 2041 }, { "epoch": 2.546475358702433, "grad_norm": 0.12174837271925616, "learning_rate": 1.5326940853063443e-06, "loss": 0.0279, "num_tokens": 155647393.0, "step": 2042 }, { "epoch": 2.54772301933874, "grad_norm": 0.13480309447667146, "learning_rate": 1.5298378104690636e-06, "loss": 0.0298, "num_tokens": 155724327.0, "step": 2043 }, { "epoch": 2.5489706799750467, "grad_norm": 0.11661321788253896, "learning_rate": 1.5269887347358414e-06, "loss": 0.0283, "num_tokens": 155801249.0, "step": 2044 }, { "epoch": 2.5502183406113534, "grad_norm": 0.1247113771553184, "learning_rate": 1.5241468632729161e-06, "loss": 0.0289, "num_tokens": 155877398.0, "step": 2045 }, { "epoch": 2.5514660012476607, "grad_norm": 0.12248316911861816, "learning_rate": 1.5213122012334572e-06, "loss": 0.0291, "num_tokens": 155953871.0, "step": 2046 }, { "epoch": 2.5527136618839674, "grad_norm": 0.11970375576013077, "learning_rate": 1.5184847537575647e-06, "loss": 0.0298, "num_tokens": 156030983.0, "step": 2047 }, { "epoch": 2.5539613225202746, "grad_norm": 0.11663205314443578, "learning_rate": 1.5156645259722565e-06, "loss": 0.0288, "num_tokens": 156107454.0, "step": 2048 }, { "epoch": 2.5552089831565814, "grad_norm": 0.13006782115149237, "learning_rate": 1.5128515229914568e-06, "loss": 0.0286, "num_tokens": 156183133.0, "step": 2049 }, { "epoch": 2.556456643792888, "grad_norm": 0.1205402849568315, "learning_rate": 1.5100457499159897e-06, "loss": 0.0287, "num_tokens": 156258840.0, "step": 2050 }, { "epoch": 2.5577043044291954, "grad_norm": 0.11732612306689322, "learning_rate": 1.507247211833572e-06, "loss": 0.0284, "num_tokens": 156334760.0, "step": 2051 }, { "epoch": 2.558951965065502, "grad_norm": 0.1096467375696786, "learning_rate": 1.5044559138187967e-06, "loss": 0.0282, "num_tokens": 156411498.0, "step": 2052 }, { "epoch": 2.5601996257018094, "grad_norm": 0.10682251478899628, "learning_rate": 1.5016718609331315e-06, "loss": 0.0277, "num_tokens": 156486644.0, "step": 2053 }, { "epoch": 2.561447286338116, "grad_norm": 0.1231457023538011, "learning_rate": 1.4988950582249061e-06, "loss": 0.0279, "num_tokens": 156562424.0, "step": 2054 }, { "epoch": 2.562694946974423, "grad_norm": 0.11102241679752041, "learning_rate": 1.4961255107293044e-06, "loss": 0.0284, "num_tokens": 156638766.0, "step": 2055 }, { "epoch": 2.5639426076107297, "grad_norm": 0.11864437556503948, "learning_rate": 1.4933632234683506e-06, "loss": 0.0286, "num_tokens": 156715589.0, "step": 2056 }, { "epoch": 2.565190268247037, "grad_norm": 0.11512676728660194, "learning_rate": 1.4906082014509088e-06, "loss": 0.0286, "num_tokens": 156792091.0, "step": 2057 }, { "epoch": 2.5664379288833437, "grad_norm": 0.11182137223682226, "learning_rate": 1.4878604496726653e-06, "loss": 0.0284, "num_tokens": 156867419.0, "step": 2058 }, { "epoch": 2.567685589519651, "grad_norm": 0.11138300175009733, "learning_rate": 1.4851199731161243e-06, "loss": 0.0278, "num_tokens": 156942944.0, "step": 2059 }, { "epoch": 2.5689332501559576, "grad_norm": 0.11691444098223876, "learning_rate": 1.4823867767505981e-06, "loss": 0.0283, "num_tokens": 157019631.0, "step": 2060 }, { "epoch": 2.5701809107922644, "grad_norm": 0.11570369933372947, "learning_rate": 1.4796608655322001e-06, "loss": 0.0286, "num_tokens": 157096265.0, "step": 2061 }, { "epoch": 2.571428571428571, "grad_norm": 0.12017368506601983, "learning_rate": 1.476942244403829e-06, "loss": 0.0288, "num_tokens": 157173740.0, "step": 2062 }, { "epoch": 2.5726762320648784, "grad_norm": 0.11019267524765289, "learning_rate": 1.4742309182951663e-06, "loss": 0.0288, "num_tokens": 157249715.0, "step": 2063 }, { "epoch": 2.573923892701185, "grad_norm": 0.10787044892409242, "learning_rate": 1.4715268921226677e-06, "loss": 0.0278, "num_tokens": 157325380.0, "step": 2064 }, { "epoch": 2.5751715533374924, "grad_norm": 0.11633281472117153, "learning_rate": 1.468830170789548e-06, "loss": 0.0287, "num_tokens": 157402317.0, "step": 2065 }, { "epoch": 2.576419213973799, "grad_norm": 0.11499276278186372, "learning_rate": 1.4661407591857795e-06, "loss": 0.0284, "num_tokens": 157478723.0, "step": 2066 }, { "epoch": 2.577666874610106, "grad_norm": 0.11385546589543385, "learning_rate": 1.4634586621880786e-06, "loss": 0.0291, "num_tokens": 157554399.0, "step": 2067 }, { "epoch": 2.578914535246413, "grad_norm": 0.11937387845295565, "learning_rate": 1.4607838846598959e-06, "loss": 0.0289, "num_tokens": 157629911.0, "step": 2068 }, { "epoch": 2.58016219588272, "grad_norm": 0.11544428870648846, "learning_rate": 1.4581164314514127e-06, "loss": 0.0287, "num_tokens": 157707624.0, "step": 2069 }, { "epoch": 2.581409856519027, "grad_norm": 0.12638135944647513, "learning_rate": 1.4554563073995284e-06, "loss": 0.0289, "num_tokens": 157782694.0, "step": 2070 }, { "epoch": 2.582657517155334, "grad_norm": 0.12283297777573274, "learning_rate": 1.452803517327852e-06, "loss": 0.0388, "num_tokens": 157859882.0, "step": 2071 }, { "epoch": 2.5839051777916406, "grad_norm": 0.11551837073593679, "learning_rate": 1.450158066046692e-06, "loss": 0.029, "num_tokens": 157935097.0, "step": 2072 }, { "epoch": 2.5851528384279474, "grad_norm": 0.11356655049306041, "learning_rate": 1.4475199583530536e-06, "loss": 0.0281, "num_tokens": 158010592.0, "step": 2073 }, { "epoch": 2.5864004990642546, "grad_norm": 0.1289783745736143, "learning_rate": 1.444889199030622e-06, "loss": 0.0284, "num_tokens": 158087479.0, "step": 2074 }, { "epoch": 2.5876481597005614, "grad_norm": 0.11276320187709267, "learning_rate": 1.4422657928497572e-06, "loss": 0.0283, "num_tokens": 158162638.0, "step": 2075 }, { "epoch": 2.5888958203368686, "grad_norm": 0.12913303698692233, "learning_rate": 1.4396497445674917e-06, "loss": 0.0291, "num_tokens": 158239511.0, "step": 2076 }, { "epoch": 2.5901434809731754, "grad_norm": 0.1161454313827607, "learning_rate": 1.4370410589275096e-06, "loss": 0.0291, "num_tokens": 158316739.0, "step": 2077 }, { "epoch": 2.591391141609482, "grad_norm": 0.12022429251512276, "learning_rate": 1.4344397406601454e-06, "loss": 0.0295, "num_tokens": 158392820.0, "step": 2078 }, { "epoch": 2.592638802245789, "grad_norm": 0.12182801317509814, "learning_rate": 1.4318457944823775e-06, "loss": 0.0289, "num_tokens": 158469327.0, "step": 2079 }, { "epoch": 2.593886462882096, "grad_norm": 0.12531918190640393, "learning_rate": 1.4292592250978137e-06, "loss": 0.0295, "num_tokens": 158544876.0, "step": 2080 }, { "epoch": 2.595134123518403, "grad_norm": 0.11731397679998512, "learning_rate": 1.4266800371966844e-06, "loss": 0.0297, "num_tokens": 158621884.0, "step": 2081 }, { "epoch": 2.59638178415471, "grad_norm": 0.12030922746281104, "learning_rate": 1.424108235455838e-06, "loss": 0.0289, "num_tokens": 158697140.0, "step": 2082 }, { "epoch": 2.597629444791017, "grad_norm": 0.12591593849077176, "learning_rate": 1.4215438245387303e-06, "loss": 0.03, "num_tokens": 158773147.0, "step": 2083 }, { "epoch": 2.5988771054273236, "grad_norm": 0.11333367538005146, "learning_rate": 1.41898680909541e-06, "loss": 0.0289, "num_tokens": 158848983.0, "step": 2084 }, { "epoch": 2.600124766063631, "grad_norm": 0.12222202988686626, "learning_rate": 1.4164371937625222e-06, "loss": 0.0298, "num_tokens": 158924781.0, "step": 2085 }, { "epoch": 2.6013724266999376, "grad_norm": 0.13082834567148519, "learning_rate": 1.4138949831632879e-06, "loss": 0.0295, "num_tokens": 159001962.0, "step": 2086 }, { "epoch": 2.6026200873362444, "grad_norm": 0.11153925261065922, "learning_rate": 1.4113601819075037e-06, "loss": 0.0279, "num_tokens": 159076678.0, "step": 2087 }, { "epoch": 2.6038677479725516, "grad_norm": 0.12512392901329744, "learning_rate": 1.4088327945915315e-06, "loss": 0.0298, "num_tokens": 159153608.0, "step": 2088 }, { "epoch": 2.6051154086088584, "grad_norm": 0.11413584500515658, "learning_rate": 1.4063128257982867e-06, "loss": 0.0289, "num_tokens": 159229793.0, "step": 2089 }, { "epoch": 2.606363069245165, "grad_norm": 0.10591067240737526, "learning_rate": 1.4038002800972362e-06, "loss": 0.0279, "num_tokens": 159304561.0, "step": 2090 }, { "epoch": 2.6076107298814724, "grad_norm": 0.10787585851556851, "learning_rate": 1.401295162044383e-06, "loss": 0.0288, "num_tokens": 159380713.0, "step": 2091 }, { "epoch": 2.608858390517779, "grad_norm": 0.11974792801092993, "learning_rate": 1.3987974761822656e-06, "loss": 0.0285, "num_tokens": 159457696.0, "step": 2092 }, { "epoch": 2.6101060511540863, "grad_norm": 0.11054859699617442, "learning_rate": 1.3963072270399411e-06, "loss": 0.028, "num_tokens": 159534657.0, "step": 2093 }, { "epoch": 2.611353711790393, "grad_norm": 0.11316028322498743, "learning_rate": 1.393824419132986e-06, "loss": 0.0281, "num_tokens": 159610294.0, "step": 2094 }, { "epoch": 2.6126013724267, "grad_norm": 0.11900041193474127, "learning_rate": 1.3913490569634796e-06, "loss": 0.029, "num_tokens": 159686082.0, "step": 2095 }, { "epoch": 2.6138490330630066, "grad_norm": 0.1265360706615686, "learning_rate": 1.388881145020002e-06, "loss": 0.0292, "num_tokens": 159762063.0, "step": 2096 }, { "epoch": 2.615096693699314, "grad_norm": 0.10932555533496009, "learning_rate": 1.3864206877776245e-06, "loss": 0.0283, "num_tokens": 159838048.0, "step": 2097 }, { "epoch": 2.6163443543356206, "grad_norm": 0.12179756460559203, "learning_rate": 1.3839676896978997e-06, "loss": 0.0292, "num_tokens": 159914786.0, "step": 2098 }, { "epoch": 2.617592014971928, "grad_norm": 0.11381057818569847, "learning_rate": 1.3815221552288541e-06, "loss": 0.0286, "num_tokens": 159990643.0, "step": 2099 }, { "epoch": 2.6188396756082346, "grad_norm": 0.12589741192291856, "learning_rate": 1.3790840888049802e-06, "loss": 0.0288, "num_tokens": 160066206.0, "step": 2100 }, { "epoch": 2.6200873362445414, "grad_norm": 0.1314569319576317, "learning_rate": 1.3766534948472307e-06, "loss": 0.0292, "num_tokens": 160141809.0, "step": 2101 }, { "epoch": 2.621334996880848, "grad_norm": 0.11817206295694255, "learning_rate": 1.3742303777630057e-06, "loss": 0.0296, "num_tokens": 160218086.0, "step": 2102 }, { "epoch": 2.6225826575171554, "grad_norm": 0.12899261334921272, "learning_rate": 1.3718147419461497e-06, "loss": 0.0297, "num_tokens": 160293689.0, "step": 2103 }, { "epoch": 2.623830318153462, "grad_norm": 0.13229897643742755, "learning_rate": 1.3694065917769414e-06, "loss": 0.0294, "num_tokens": 160370524.0, "step": 2104 }, { "epoch": 2.6250779787897693, "grad_norm": 0.1150696614738625, "learning_rate": 1.367005931622084e-06, "loss": 0.0281, "num_tokens": 160446520.0, "step": 2105 }, { "epoch": 2.626325639426076, "grad_norm": 0.12646685678510525, "learning_rate": 1.3646127658346992e-06, "loss": 0.0292, "num_tokens": 160522619.0, "step": 2106 }, { "epoch": 2.627573300062383, "grad_norm": 0.12074932800009718, "learning_rate": 1.3622270987543215e-06, "loss": 0.0303, "num_tokens": 160597550.0, "step": 2107 }, { "epoch": 2.62882096069869, "grad_norm": 0.12404470424703581, "learning_rate": 1.3598489347068858e-06, "loss": 0.03, "num_tokens": 160673517.0, "step": 2108 }, { "epoch": 2.630068621334997, "grad_norm": 0.12077222330645102, "learning_rate": 1.357478278004721e-06, "loss": 0.0289, "num_tokens": 160749412.0, "step": 2109 }, { "epoch": 2.631316281971304, "grad_norm": 0.12197047343054625, "learning_rate": 1.3551151329465462e-06, "loss": 0.0294, "num_tokens": 160827091.0, "step": 2110 }, { "epoch": 2.632563942607611, "grad_norm": 0.120305318419666, "learning_rate": 1.3527595038174566e-06, "loss": 0.0288, "num_tokens": 160903819.0, "step": 2111 }, { "epoch": 2.6338116032439176, "grad_norm": 0.10870153637376272, "learning_rate": 1.35041139488892e-06, "loss": 0.0276, "num_tokens": 160979263.0, "step": 2112 }, { "epoch": 2.6350592638802244, "grad_norm": 0.11925953541163148, "learning_rate": 1.3480708104187685e-06, "loss": 0.0289, "num_tokens": 161055934.0, "step": 2113 }, { "epoch": 2.6363069245165316, "grad_norm": 0.11768773201711762, "learning_rate": 1.3457377546511882e-06, "loss": 0.0285, "num_tokens": 161132407.0, "step": 2114 }, { "epoch": 2.6375545851528384, "grad_norm": 0.12563149415797625, "learning_rate": 1.3434122318167142e-06, "loss": 0.0285, "num_tokens": 161208444.0, "step": 2115 }, { "epoch": 2.6388022457891456, "grad_norm": 0.10760676490787521, "learning_rate": 1.3410942461322236e-06, "loss": 0.0273, "num_tokens": 161284775.0, "step": 2116 }, { "epoch": 2.6400499064254523, "grad_norm": 0.12383103239434214, "learning_rate": 1.3387838018009239e-06, "loss": 0.0296, "num_tokens": 161361877.0, "step": 2117 }, { "epoch": 2.641297567061759, "grad_norm": 0.12220346777866166, "learning_rate": 1.3364809030123477e-06, "loss": 0.0279, "num_tokens": 161438058.0, "step": 2118 }, { "epoch": 2.642545227698066, "grad_norm": 0.11878003052432862, "learning_rate": 1.3341855539423499e-06, "loss": 0.0298, "num_tokens": 161514402.0, "step": 2119 }, { "epoch": 2.643792888334373, "grad_norm": 0.1173199590403945, "learning_rate": 1.3318977587530907e-06, "loss": 0.0287, "num_tokens": 161590055.0, "step": 2120 }, { "epoch": 2.64504054897068, "grad_norm": 0.1246361351224797, "learning_rate": 1.3296175215930326e-06, "loss": 0.0287, "num_tokens": 161667736.0, "step": 2121 }, { "epoch": 2.646288209606987, "grad_norm": 0.11285791824953058, "learning_rate": 1.3273448465969376e-06, "loss": 0.0279, "num_tokens": 161742212.0, "step": 2122 }, { "epoch": 2.647535870243294, "grad_norm": 0.11252597487092157, "learning_rate": 1.3250797378858507e-06, "loss": 0.0286, "num_tokens": 161816984.0, "step": 2123 }, { "epoch": 2.6487835308796006, "grad_norm": 0.11263456395387882, "learning_rate": 1.3228221995670987e-06, "loss": 0.0286, "num_tokens": 161893337.0, "step": 2124 }, { "epoch": 2.650031191515908, "grad_norm": 0.12248242509666539, "learning_rate": 1.3205722357342807e-06, "loss": 0.0293, "num_tokens": 161968441.0, "step": 2125 }, { "epoch": 2.6512788521522146, "grad_norm": 0.10926169879500301, "learning_rate": 1.3183298504672626e-06, "loss": 0.0281, "num_tokens": 162044146.0, "step": 2126 }, { "epoch": 2.6525265127885214, "grad_norm": 0.11384117282592209, "learning_rate": 1.316095047832166e-06, "loss": 0.0286, "num_tokens": 162120564.0, "step": 2127 }, { "epoch": 2.6537741734248286, "grad_norm": 0.11378643996598749, "learning_rate": 1.3138678318813618e-06, "loss": 0.0281, "num_tokens": 162196708.0, "step": 2128 }, { "epoch": 2.6550218340611353, "grad_norm": 0.11191194350156596, "learning_rate": 1.3116482066534686e-06, "loss": 0.0284, "num_tokens": 162273273.0, "step": 2129 }, { "epoch": 2.656269494697442, "grad_norm": 0.1157012495472187, "learning_rate": 1.3094361761733356e-06, "loss": 0.0287, "num_tokens": 162349729.0, "step": 2130 }, { "epoch": 2.6575171553337493, "grad_norm": 0.12725034795133214, "learning_rate": 1.3072317444520449e-06, "loss": 0.03, "num_tokens": 162425187.0, "step": 2131 }, { "epoch": 2.658764815970056, "grad_norm": 0.12362640477139697, "learning_rate": 1.3050349154868946e-06, "loss": 0.0293, "num_tokens": 162501517.0, "step": 2132 }, { "epoch": 2.6600124766063633, "grad_norm": 0.11027867097576076, "learning_rate": 1.3028456932614019e-06, "loss": 0.028, "num_tokens": 162576306.0, "step": 2133 }, { "epoch": 2.66126013724267, "grad_norm": 0.1239816165697189, "learning_rate": 1.3006640817452873e-06, "loss": 0.0294, "num_tokens": 162652236.0, "step": 2134 }, { "epoch": 2.662507797878977, "grad_norm": 0.12208914364343404, "learning_rate": 1.2984900848944727e-06, "loss": 0.0286, "num_tokens": 162727601.0, "step": 2135 }, { "epoch": 2.6637554585152836, "grad_norm": 0.13101940016702904, "learning_rate": 1.2963237066510715e-06, "loss": 0.0307, "num_tokens": 162803461.0, "step": 2136 }, { "epoch": 2.665003119151591, "grad_norm": 0.11919262059257354, "learning_rate": 1.2941649509433808e-06, "loss": 0.0294, "num_tokens": 162879944.0, "step": 2137 }, { "epoch": 2.6662507797878976, "grad_norm": 0.11810995954153358, "learning_rate": 1.2920138216858791e-06, "loss": 0.0291, "num_tokens": 162955901.0, "step": 2138 }, { "epoch": 2.667498440424205, "grad_norm": 0.14155939684520275, "learning_rate": 1.289870322779212e-06, "loss": 0.0305, "num_tokens": 163032700.0, "step": 2139 }, { "epoch": 2.6687461010605116, "grad_norm": 0.11500433107198005, "learning_rate": 1.2877344581101922e-06, "loss": 0.0276, "num_tokens": 163108621.0, "step": 2140 }, { "epoch": 2.6699937616968183, "grad_norm": 0.11864582024908449, "learning_rate": 1.2856062315517885e-06, "loss": 0.0293, "num_tokens": 163186508.0, "step": 2141 }, { "epoch": 2.671241422333125, "grad_norm": 0.11900223617148727, "learning_rate": 1.2834856469631174e-06, "loss": 0.0288, "num_tokens": 163263228.0, "step": 2142 }, { "epoch": 2.6724890829694323, "grad_norm": 0.11641318239763454, "learning_rate": 1.28137270818944e-06, "loss": 0.0288, "num_tokens": 163339630.0, "step": 2143 }, { "epoch": 2.673736743605739, "grad_norm": 0.12490145643178303, "learning_rate": 1.279267419062155e-06, "loss": 0.028, "num_tokens": 163416071.0, "step": 2144 }, { "epoch": 2.6749844042420463, "grad_norm": 0.11237312172406205, "learning_rate": 1.2771697833987852e-06, "loss": 0.0287, "num_tokens": 163491679.0, "step": 2145 }, { "epoch": 2.676232064878353, "grad_norm": 0.12175369762015016, "learning_rate": 1.2750798050029782e-06, "loss": 0.0299, "num_tokens": 163567232.0, "step": 2146 }, { "epoch": 2.67747972551466, "grad_norm": 0.12367303304199498, "learning_rate": 1.272997487664499e-06, "loss": 0.0288, "num_tokens": 163645463.0, "step": 2147 }, { "epoch": 2.678727386150967, "grad_norm": 0.1404182279844831, "learning_rate": 1.2709228351592167e-06, "loss": 0.0308, "num_tokens": 163722602.0, "step": 2148 }, { "epoch": 2.679975046787274, "grad_norm": 0.11022690433972186, "learning_rate": 1.2688558512491032e-06, "loss": 0.0279, "num_tokens": 163798334.0, "step": 2149 }, { "epoch": 2.681222707423581, "grad_norm": 0.12539838289586516, "learning_rate": 1.2667965396822257e-06, "loss": 0.0308, "num_tokens": 163875207.0, "step": 2150 }, { "epoch": 2.682470368059888, "grad_norm": 0.11620046852726527, "learning_rate": 1.2647449041927385e-06, "loss": 0.0286, "num_tokens": 163950395.0, "step": 2151 }, { "epoch": 2.6837180286961946, "grad_norm": 0.11441802870799525, "learning_rate": 1.2627009485008754e-06, "loss": 0.0288, "num_tokens": 164025874.0, "step": 2152 }, { "epoch": 2.6849656893325013, "grad_norm": 0.12686677130328586, "learning_rate": 1.2606646763129476e-06, "loss": 0.0283, "num_tokens": 164102560.0, "step": 2153 }, { "epoch": 2.6862133499688086, "grad_norm": 0.11642503259858736, "learning_rate": 1.2586360913213315e-06, "loss": 0.0286, "num_tokens": 164178621.0, "step": 2154 }, { "epoch": 2.6874610106051153, "grad_norm": 0.123454048742059, "learning_rate": 1.256615197204465e-06, "loss": 0.0295, "num_tokens": 164257244.0, "step": 2155 }, { "epoch": 2.6887086712414225, "grad_norm": 0.12546748989789816, "learning_rate": 1.2546019976268403e-06, "loss": 0.03, "num_tokens": 164333838.0, "step": 2156 }, { "epoch": 2.6899563318777293, "grad_norm": 0.11383031726636761, "learning_rate": 1.2525964962389961e-06, "loss": 0.0281, "num_tokens": 164410088.0, "step": 2157 }, { "epoch": 2.691203992514036, "grad_norm": 0.11824387324543177, "learning_rate": 1.250598696677512e-06, "loss": 0.029, "num_tokens": 164487052.0, "step": 2158 }, { "epoch": 2.692451653150343, "grad_norm": 0.10958624171174712, "learning_rate": 1.2486086025650045e-06, "loss": 0.0276, "num_tokens": 164563947.0, "step": 2159 }, { "epoch": 2.69369931378665, "grad_norm": 0.1261007581520669, "learning_rate": 1.246626217510114e-06, "loss": 0.0292, "num_tokens": 164641006.0, "step": 2160 }, { "epoch": 2.694946974422957, "grad_norm": 0.11602850342843693, "learning_rate": 1.244651545107503e-06, "loss": 0.0285, "num_tokens": 164716872.0, "step": 2161 }, { "epoch": 2.696194635059264, "grad_norm": 0.12345956113398639, "learning_rate": 1.2426845889378516e-06, "loss": 0.0284, "num_tokens": 164792532.0, "step": 2162 }, { "epoch": 2.697442295695571, "grad_norm": 0.1134742454092433, "learning_rate": 1.2407253525678453e-06, "loss": 0.0288, "num_tokens": 164868337.0, "step": 2163 }, { "epoch": 2.6986899563318776, "grad_norm": 0.11240379817612962, "learning_rate": 1.2387738395501714e-06, "loss": 0.0286, "num_tokens": 164944340.0, "step": 2164 }, { "epoch": 2.699937616968185, "grad_norm": 0.11581388826993168, "learning_rate": 1.236830053423512e-06, "loss": 0.0283, "num_tokens": 165023931.0, "step": 2165 }, { "epoch": 2.7011852776044916, "grad_norm": 0.11453442673809472, "learning_rate": 1.2348939977125412e-06, "loss": 0.0279, "num_tokens": 165103314.0, "step": 2166 }, { "epoch": 2.7024329382407988, "grad_norm": 0.11155255093180133, "learning_rate": 1.2329656759279108e-06, "loss": 0.0287, "num_tokens": 165179114.0, "step": 2167 }, { "epoch": 2.7036805988771055, "grad_norm": 0.11330932351293212, "learning_rate": 1.2310450915662516e-06, "loss": 0.0279, "num_tokens": 165254021.0, "step": 2168 }, { "epoch": 2.7049282595134123, "grad_norm": 0.12186227732333973, "learning_rate": 1.229132248110165e-06, "loss": 0.0292, "num_tokens": 165330407.0, "step": 2169 }, { "epoch": 2.706175920149719, "grad_norm": 0.11674335927535405, "learning_rate": 1.2272271490282134e-06, "loss": 0.0287, "num_tokens": 165406621.0, "step": 2170 }, { "epoch": 2.7074235807860263, "grad_norm": 0.11965473757061497, "learning_rate": 1.2253297977749163e-06, "loss": 0.0276, "num_tokens": 165482134.0, "step": 2171 }, { "epoch": 2.708671241422333, "grad_norm": 0.11983758170547645, "learning_rate": 1.2234401977907468e-06, "loss": 0.0286, "num_tokens": 165557987.0, "step": 2172 }, { "epoch": 2.7099189020586403, "grad_norm": 0.11471324375894294, "learning_rate": 1.2215583525021203e-06, "loss": 0.0281, "num_tokens": 165634677.0, "step": 2173 }, { "epoch": 2.711166562694947, "grad_norm": 0.11355220218054589, "learning_rate": 1.2196842653213896e-06, "loss": 0.028, "num_tokens": 165710052.0, "step": 2174 }, { "epoch": 2.712414223331254, "grad_norm": 0.11767285515107533, "learning_rate": 1.2178179396468428e-06, "loss": 0.0278, "num_tokens": 165786084.0, "step": 2175 }, { "epoch": 2.7136618839675606, "grad_norm": 0.12160385618810454, "learning_rate": 1.215959378862692e-06, "loss": 0.0288, "num_tokens": 165863041.0, "step": 2176 }, { "epoch": 2.714909544603868, "grad_norm": 0.11461675756327239, "learning_rate": 1.2141085863390696e-06, "loss": 0.0286, "num_tokens": 165938910.0, "step": 2177 }, { "epoch": 2.7161572052401746, "grad_norm": 0.11525113632750776, "learning_rate": 1.2122655654320225e-06, "loss": 0.0285, "num_tokens": 166014302.0, "step": 2178 }, { "epoch": 2.717404865876482, "grad_norm": 0.11491608215364695, "learning_rate": 1.210430319483504e-06, "loss": 0.0277, "num_tokens": 166089644.0, "step": 2179 }, { "epoch": 2.7186525265127885, "grad_norm": 0.11524119513899224, "learning_rate": 1.2086028518213694e-06, "loss": 0.0282, "num_tokens": 166166298.0, "step": 2180 }, { "epoch": 2.7199001871490953, "grad_norm": 0.1286038266919602, "learning_rate": 1.206783165759371e-06, "loss": 0.03, "num_tokens": 166242302.0, "step": 2181 }, { "epoch": 2.7211478477854025, "grad_norm": 0.1285603705106346, "learning_rate": 1.204971264597148e-06, "loss": 0.0287, "num_tokens": 166320367.0, "step": 2182 }, { "epoch": 2.7223955084217093, "grad_norm": 0.12149048372392342, "learning_rate": 1.2031671516202263e-06, "loss": 0.0288, "num_tokens": 166396980.0, "step": 2183 }, { "epoch": 2.723643169058016, "grad_norm": 0.11689989628810289, "learning_rate": 1.2013708301000082e-06, "loss": 0.0282, "num_tokens": 166472334.0, "step": 2184 }, { "epoch": 2.7248908296943233, "grad_norm": 0.1296343978344365, "learning_rate": 1.199582303293767e-06, "loss": 0.03, "num_tokens": 166548633.0, "step": 2185 }, { "epoch": 2.72613849033063, "grad_norm": 0.11201979473864716, "learning_rate": 1.1978015744446417e-06, "loss": 0.0276, "num_tokens": 166624211.0, "step": 2186 }, { "epoch": 2.727386150966937, "grad_norm": 0.11959368808418909, "learning_rate": 1.1960286467816331e-06, "loss": 0.0294, "num_tokens": 166701131.0, "step": 2187 }, { "epoch": 2.728633811603244, "grad_norm": 0.1150404963886038, "learning_rate": 1.1942635235195949e-06, "loss": 0.028, "num_tokens": 166780144.0, "step": 2188 }, { "epoch": 2.729881472239551, "grad_norm": 0.11825528418985107, "learning_rate": 1.1925062078592279e-06, "loss": 0.0296, "num_tokens": 166856545.0, "step": 2189 }, { "epoch": 2.731129132875858, "grad_norm": 0.11280737921120523, "learning_rate": 1.190756702987077e-06, "loss": 0.0279, "num_tokens": 166932621.0, "step": 2190 }, { "epoch": 2.732376793512165, "grad_norm": 0.119452906940112, "learning_rate": 1.1890150120755244e-06, "loss": 0.0293, "num_tokens": 167009151.0, "step": 2191 }, { "epoch": 2.7336244541484715, "grad_norm": 0.14615475941269104, "learning_rate": 1.1872811382827811e-06, "loss": 0.03, "num_tokens": 167085945.0, "step": 2192 }, { "epoch": 2.7348721147847783, "grad_norm": 0.12157097414483191, "learning_rate": 1.1855550847528849e-06, "loss": 0.0278, "num_tokens": 167162551.0, "step": 2193 }, { "epoch": 2.7361197754210855, "grad_norm": 0.11817467596856256, "learning_rate": 1.1838368546156924e-06, "loss": 0.0295, "num_tokens": 167238923.0, "step": 2194 }, { "epoch": 2.7373674360573923, "grad_norm": 0.11504894992061995, "learning_rate": 1.182126450986874e-06, "loss": 0.0282, "num_tokens": 167314732.0, "step": 2195 }, { "epoch": 2.7386150966936995, "grad_norm": 0.12495964859507203, "learning_rate": 1.1804238769679077e-06, "loss": 0.0299, "num_tokens": 167391511.0, "step": 2196 }, { "epoch": 2.7398627573300063, "grad_norm": 0.11246368198397648, "learning_rate": 1.178729135646077e-06, "loss": 0.0278, "num_tokens": 167467831.0, "step": 2197 }, { "epoch": 2.741110417966313, "grad_norm": 0.12287539336021341, "learning_rate": 1.1770422300944586e-06, "loss": 0.028, "num_tokens": 167543169.0, "step": 2198 }, { "epoch": 2.74235807860262, "grad_norm": 0.1196178502547976, "learning_rate": 1.1753631633719217e-06, "loss": 0.0284, "num_tokens": 167618234.0, "step": 2199 }, { "epoch": 2.743605739238927, "grad_norm": 0.12728052422070063, "learning_rate": 1.1736919385231236e-06, "loss": 0.0298, "num_tokens": 167694674.0, "step": 2200 }, { "epoch": 2.744853399875234, "grad_norm": 0.11082580587324581, "learning_rate": 1.1720285585784983e-06, "loss": 0.0284, "num_tokens": 167770195.0, "step": 2201 }, { "epoch": 2.746101060511541, "grad_norm": 0.1126882445851649, "learning_rate": 1.1703730265542569e-06, "loss": 0.0287, "num_tokens": 167847001.0, "step": 2202 }, { "epoch": 2.747348721147848, "grad_norm": 0.11484684620419089, "learning_rate": 1.16872534545238e-06, "loss": 0.0282, "num_tokens": 167923273.0, "step": 2203 }, { "epoch": 2.7485963817841546, "grad_norm": 0.12201620108822543, "learning_rate": 1.1670855182606106e-06, "loss": 0.0284, "num_tokens": 168000228.0, "step": 2204 }, { "epoch": 2.7498440424204618, "grad_norm": 0.11672097206864245, "learning_rate": 1.1654535479524511e-06, "loss": 0.0295, "num_tokens": 168076189.0, "step": 2205 }, { "epoch": 2.7510917030567685, "grad_norm": 0.10967114609254536, "learning_rate": 1.163829437487158e-06, "loss": 0.0277, "num_tokens": 168154549.0, "step": 2206 }, { "epoch": 2.7523393636930757, "grad_norm": 0.12032675683422693, "learning_rate": 1.162213189809734e-06, "loss": 0.0296, "num_tokens": 168230773.0, "step": 2207 }, { "epoch": 2.7535870243293825, "grad_norm": 0.12109856736796651, "learning_rate": 1.1606048078509235e-06, "loss": 0.0291, "num_tokens": 168306819.0, "step": 2208 }, { "epoch": 2.7548346849656893, "grad_norm": 0.12014982496104747, "learning_rate": 1.1590042945272108e-06, "loss": 0.0291, "num_tokens": 168383185.0, "step": 2209 }, { "epoch": 2.756082345601996, "grad_norm": 0.11366355466444636, "learning_rate": 1.1574116527408093e-06, "loss": 0.0286, "num_tokens": 168459637.0, "step": 2210 }, { "epoch": 2.7573300062383033, "grad_norm": 0.12086871113763324, "learning_rate": 1.1558268853796597e-06, "loss": 0.0298, "num_tokens": 168536455.0, "step": 2211 }, { "epoch": 2.75857766687461, "grad_norm": 0.1208261526681414, "learning_rate": 1.1542499953174257e-06, "loss": 0.0294, "num_tokens": 168612474.0, "step": 2212 }, { "epoch": 2.7598253275109172, "grad_norm": 0.12495039793683214, "learning_rate": 1.1526809854134844e-06, "loss": 0.0301, "num_tokens": 168688888.0, "step": 2213 }, { "epoch": 2.761072988147224, "grad_norm": 0.11023696555976253, "learning_rate": 1.151119858512925e-06, "loss": 0.0277, "num_tokens": 168764636.0, "step": 2214 }, { "epoch": 2.762320648783531, "grad_norm": 0.12130654220412528, "learning_rate": 1.149566617446543e-06, "loss": 0.0279, "num_tokens": 168840715.0, "step": 2215 }, { "epoch": 2.7635683094198376, "grad_norm": 0.11126959655043199, "learning_rate": 1.1480212650308337e-06, "loss": 0.0288, "num_tokens": 168916777.0, "step": 2216 }, { "epoch": 2.7648159700561448, "grad_norm": 0.12573107467674868, "learning_rate": 1.1464838040679876e-06, "loss": 0.0291, "num_tokens": 168992599.0, "step": 2217 }, { "epoch": 2.7660636306924515, "grad_norm": 0.11635105667815067, "learning_rate": 1.1449542373458867e-06, "loss": 0.029, "num_tokens": 169069076.0, "step": 2218 }, { "epoch": 2.7673112913287587, "grad_norm": 0.11224018904577808, "learning_rate": 1.1434325676380983e-06, "loss": 0.0284, "num_tokens": 169144728.0, "step": 2219 }, { "epoch": 2.7685589519650655, "grad_norm": 0.11548677961325926, "learning_rate": 1.141918797703868e-06, "loss": 0.0277, "num_tokens": 169219973.0, "step": 2220 }, { "epoch": 2.7698066126013723, "grad_norm": 0.12508992711477832, "learning_rate": 1.1404129302881193e-06, "loss": 0.029, "num_tokens": 169296490.0, "step": 2221 }, { "epoch": 2.7710542732376795, "grad_norm": 0.12294871269999798, "learning_rate": 1.1389149681214456e-06, "loss": 0.0286, "num_tokens": 169372081.0, "step": 2222 }, { "epoch": 2.7723019338739863, "grad_norm": 0.1240743594501783, "learning_rate": 1.1374249139201035e-06, "loss": 0.0293, "num_tokens": 169448879.0, "step": 2223 }, { "epoch": 2.773549594510293, "grad_norm": 0.1189641989925224, "learning_rate": 1.135942770386013e-06, "loss": 0.0276, "num_tokens": 169525370.0, "step": 2224 }, { "epoch": 2.7747972551466002, "grad_norm": 0.12031710218780445, "learning_rate": 1.1344685402067475e-06, "loss": 0.0299, "num_tokens": 169601681.0, "step": 2225 }, { "epoch": 2.776044915782907, "grad_norm": 0.1145940404852384, "learning_rate": 1.1330022260555321e-06, "loss": 0.028, "num_tokens": 169677327.0, "step": 2226 }, { "epoch": 2.777292576419214, "grad_norm": 0.12036981134601163, "learning_rate": 1.1315438305912377e-06, "loss": 0.0287, "num_tokens": 169752650.0, "step": 2227 }, { "epoch": 2.778540237055521, "grad_norm": 0.1232602059747911, "learning_rate": 1.1300933564583764e-06, "loss": 0.028, "num_tokens": 169828354.0, "step": 2228 }, { "epoch": 2.7797878976918278, "grad_norm": 0.1160036562852578, "learning_rate": 1.1286508062870952e-06, "loss": 0.0284, "num_tokens": 169903693.0, "step": 2229 }, { "epoch": 2.781035558328135, "grad_norm": 0.11503613474247552, "learning_rate": 1.1272161826931745e-06, "loss": 0.0285, "num_tokens": 169978944.0, "step": 2230 }, { "epoch": 2.7822832189644418, "grad_norm": 0.11815421947653913, "learning_rate": 1.1257894882780206e-06, "loss": 0.0284, "num_tokens": 170054946.0, "step": 2231 }, { "epoch": 2.7835308796007485, "grad_norm": 0.118767676640114, "learning_rate": 1.1243707256286606e-06, "loss": 0.0284, "num_tokens": 170131746.0, "step": 2232 }, { "epoch": 2.7847785402370553, "grad_norm": 0.11476662209958129, "learning_rate": 1.1229598973177407e-06, "loss": 0.0286, "num_tokens": 170207963.0, "step": 2233 }, { "epoch": 2.7860262008733625, "grad_norm": 0.11806306817521028, "learning_rate": 1.1215570059035199e-06, "loss": 0.0293, "num_tokens": 170283230.0, "step": 2234 }, { "epoch": 2.7872738615096693, "grad_norm": 0.10882246659086171, "learning_rate": 1.1201620539298636e-06, "loss": 0.0276, "num_tokens": 170359684.0, "step": 2235 }, { "epoch": 2.7885215221459765, "grad_norm": 0.11492211796238391, "learning_rate": 1.1187750439262405e-06, "loss": 0.028, "num_tokens": 170435000.0, "step": 2236 }, { "epoch": 2.7897691827822833, "grad_norm": 0.1282203899930841, "learning_rate": 1.1173959784077207e-06, "loss": 0.0302, "num_tokens": 170511425.0, "step": 2237 }, { "epoch": 2.79101684341859, "grad_norm": 0.11459020963724369, "learning_rate": 1.1160248598749652e-06, "loss": 0.0287, "num_tokens": 170588124.0, "step": 2238 }, { "epoch": 2.7922645040548972, "grad_norm": 0.118219825937956, "learning_rate": 1.114661690814227e-06, "loss": 0.0286, "num_tokens": 170664662.0, "step": 2239 }, { "epoch": 2.793512164691204, "grad_norm": 0.12474399789247946, "learning_rate": 1.1133064736973443e-06, "loss": 0.0286, "num_tokens": 170741642.0, "step": 2240 }, { "epoch": 2.7947598253275108, "grad_norm": 0.11978611303131881, "learning_rate": 1.1119592109817346e-06, "loss": 0.0291, "num_tokens": 170817653.0, "step": 2241 }, { "epoch": 2.796007485963818, "grad_norm": 0.11840215504882184, "learning_rate": 1.1106199051103922e-06, "loss": 0.0288, "num_tokens": 170892935.0, "step": 2242 }, { "epoch": 2.7972551466001248, "grad_norm": 0.11623909659922183, "learning_rate": 1.109288558511884e-06, "loss": 0.0291, "num_tokens": 170969000.0, "step": 2243 }, { "epoch": 2.7985028072364315, "grad_norm": 0.11698666849697116, "learning_rate": 1.1079651736003441e-06, "loss": 0.0288, "num_tokens": 171044723.0, "step": 2244 }, { "epoch": 2.7997504678727387, "grad_norm": 0.13606371430590922, "learning_rate": 1.106649752775468e-06, "loss": 0.0293, "num_tokens": 171121538.0, "step": 2245 }, { "epoch": 2.8009981285090455, "grad_norm": 0.12048429868469147, "learning_rate": 1.1053422984225127e-06, "loss": 0.0281, "num_tokens": 171198077.0, "step": 2246 }, { "epoch": 2.8022457891453527, "grad_norm": 0.12470269426113269, "learning_rate": 1.1040428129122873e-06, "loss": 0.0279, "num_tokens": 171277362.0, "step": 2247 }, { "epoch": 2.8034934497816595, "grad_norm": 0.19130209142145127, "learning_rate": 1.102751298601152e-06, "loss": 0.0296, "num_tokens": 171354521.0, "step": 2248 }, { "epoch": 2.8047411104179663, "grad_norm": 0.12456455451583687, "learning_rate": 1.1014677578310128e-06, "loss": 0.029, "num_tokens": 171432360.0, "step": 2249 }, { "epoch": 2.805988771054273, "grad_norm": 0.11699408718174314, "learning_rate": 1.1001921929293172e-06, "loss": 0.0283, "num_tokens": 171507814.0, "step": 2250 }, { "epoch": 2.8072364316905802, "grad_norm": 0.12226521217229692, "learning_rate": 1.0989246062090495e-06, "loss": 0.0286, "num_tokens": 171585169.0, "step": 2251 }, { "epoch": 2.808484092326887, "grad_norm": 0.11991178229904237, "learning_rate": 1.0976649999687282e-06, "loss": 0.0296, "num_tokens": 171661276.0, "step": 2252 }, { "epoch": 2.809731752963194, "grad_norm": 0.10974299356259266, "learning_rate": 1.096413376492399e-06, "loss": 0.0281, "num_tokens": 171736965.0, "step": 2253 }, { "epoch": 2.810979413599501, "grad_norm": 0.12052441591094999, "learning_rate": 1.0951697380496343e-06, "loss": 0.0287, "num_tokens": 171812257.0, "step": 2254 }, { "epoch": 2.8122270742358078, "grad_norm": 0.11085207515729092, "learning_rate": 1.093934086895526e-06, "loss": 0.0271, "num_tokens": 171888869.0, "step": 2255 }, { "epoch": 2.8134747348721145, "grad_norm": 0.13794568182203568, "learning_rate": 1.0927064252706845e-06, "loss": 0.0291, "num_tokens": 171965014.0, "step": 2256 }, { "epoch": 2.8147223955084217, "grad_norm": 0.11756094006014182, "learning_rate": 1.0914867554012297e-06, "loss": 0.0291, "num_tokens": 172042712.0, "step": 2257 }, { "epoch": 2.8159700561447285, "grad_norm": 0.11949250572717551, "learning_rate": 1.090275079498793e-06, "loss": 0.0283, "num_tokens": 172117766.0, "step": 2258 }, { "epoch": 2.8172177167810357, "grad_norm": 0.12671245281109003, "learning_rate": 1.0890713997605085e-06, "loss": 0.0299, "num_tokens": 172194141.0, "step": 2259 }, { "epoch": 2.8184653774173425, "grad_norm": 0.11890427787757048, "learning_rate": 1.0878757183690112e-06, "loss": 0.0289, "num_tokens": 172271023.0, "step": 2260 }, { "epoch": 2.8197130380536493, "grad_norm": 0.11355197416439035, "learning_rate": 1.086688037492433e-06, "loss": 0.028, "num_tokens": 172346761.0, "step": 2261 }, { "epoch": 2.8209606986899565, "grad_norm": 0.1273777473876187, "learning_rate": 1.0855083592843985e-06, "loss": 0.0305, "num_tokens": 172422932.0, "step": 2262 }, { "epoch": 2.8222083593262632, "grad_norm": 0.12440355227618752, "learning_rate": 1.0843366858840209e-06, "loss": 0.029, "num_tokens": 172500459.0, "step": 2263 }, { "epoch": 2.8234560199625705, "grad_norm": 0.13976356807487542, "learning_rate": 1.0831730194158982e-06, "loss": 0.0308, "num_tokens": 172578112.0, "step": 2264 }, { "epoch": 2.824703680598877, "grad_norm": 0.11599206471109613, "learning_rate": 1.0820173619901093e-06, "loss": 0.0277, "num_tokens": 172653995.0, "step": 2265 }, { "epoch": 2.825951341235184, "grad_norm": 0.11559659458722597, "learning_rate": 1.08086971570221e-06, "loss": 0.028, "num_tokens": 172730223.0, "step": 2266 }, { "epoch": 2.8271990018714908, "grad_norm": 0.1195433989503934, "learning_rate": 1.0797300826332307e-06, "loss": 0.028, "num_tokens": 172805019.0, "step": 2267 }, { "epoch": 2.828446662507798, "grad_norm": 0.11876185697375152, "learning_rate": 1.07859846484967e-06, "loss": 0.028, "num_tokens": 172880699.0, "step": 2268 }, { "epoch": 2.8296943231441047, "grad_norm": 0.11684909395782095, "learning_rate": 1.0774748644034936e-06, "loss": 0.0291, "num_tokens": 172959158.0, "step": 2269 }, { "epoch": 2.830941983780412, "grad_norm": 0.10958538812294302, "learning_rate": 1.0763592833321277e-06, "loss": 0.0289, "num_tokens": 173034913.0, "step": 2270 }, { "epoch": 2.8321896444167187, "grad_norm": 0.11373588402175755, "learning_rate": 1.0752517236584595e-06, "loss": 0.0289, "num_tokens": 173110805.0, "step": 2271 }, { "epoch": 2.8334373050530255, "grad_norm": 0.12280476964744709, "learning_rate": 1.0741521873908283e-06, "loss": 0.0286, "num_tokens": 173186522.0, "step": 2272 }, { "epoch": 2.8346849656893323, "grad_norm": 0.12463575409519533, "learning_rate": 1.0730606765230257e-06, "loss": 0.0294, "num_tokens": 173264138.0, "step": 2273 }, { "epoch": 2.8359326263256395, "grad_norm": 0.11778463862290611, "learning_rate": 1.0719771930342913e-06, "loss": 0.0285, "num_tokens": 173340444.0, "step": 2274 }, { "epoch": 2.8371802869619462, "grad_norm": 0.12702722233670585, "learning_rate": 1.0709017388893075e-06, "loss": 0.0294, "num_tokens": 173416777.0, "step": 2275 }, { "epoch": 2.8384279475982535, "grad_norm": 0.11507472807146055, "learning_rate": 1.0698343160381987e-06, "loss": 0.0289, "num_tokens": 173493542.0, "step": 2276 }, { "epoch": 2.8396756082345602, "grad_norm": 0.13647975622257214, "learning_rate": 1.0687749264165248e-06, "loss": 0.0304, "num_tokens": 173571061.0, "step": 2277 }, { "epoch": 2.840923268870867, "grad_norm": 0.11531884608838848, "learning_rate": 1.067723571945279e-06, "loss": 0.0286, "num_tokens": 173650004.0, "step": 2278 }, { "epoch": 2.842170929507174, "grad_norm": 0.1243551197922493, "learning_rate": 1.0666802545308847e-06, "loss": 0.0285, "num_tokens": 173725813.0, "step": 2279 }, { "epoch": 2.843418590143481, "grad_norm": 0.1050977663187282, "learning_rate": 1.065644976065193e-06, "loss": 0.0285, "num_tokens": 173801065.0, "step": 2280 }, { "epoch": 2.8446662507797877, "grad_norm": 0.1104574422188529, "learning_rate": 1.0646177384254747e-06, "loss": 0.028, "num_tokens": 173875892.0, "step": 2281 }, { "epoch": 2.845913911416095, "grad_norm": 0.12767157960439646, "learning_rate": 1.063598543474423e-06, "loss": 0.0292, "num_tokens": 173952726.0, "step": 2282 }, { "epoch": 2.8471615720524017, "grad_norm": 0.11758328425535548, "learning_rate": 1.062587393060147e-06, "loss": 0.0298, "num_tokens": 174031294.0, "step": 2283 }, { "epoch": 2.8484092326887085, "grad_norm": 0.1193197581441782, "learning_rate": 1.0615842890161675e-06, "loss": 0.0298, "num_tokens": 174107939.0, "step": 2284 }, { "epoch": 2.8496568933250157, "grad_norm": 0.11396787069116965, "learning_rate": 1.0605892331614158e-06, "loss": 0.0286, "num_tokens": 174183146.0, "step": 2285 }, { "epoch": 2.8509045539613225, "grad_norm": 0.1108030173333656, "learning_rate": 1.0596022273002282e-06, "loss": 0.0278, "num_tokens": 174259415.0, "step": 2286 }, { "epoch": 2.8521522145976297, "grad_norm": 0.11734545931287618, "learning_rate": 1.0586232732223446e-06, "loss": 0.0291, "num_tokens": 174336612.0, "step": 2287 }, { "epoch": 2.8533998752339365, "grad_norm": 0.12867876195037828, "learning_rate": 1.0576523727029053e-06, "loss": 0.0289, "num_tokens": 174414524.0, "step": 2288 }, { "epoch": 2.8546475358702432, "grad_norm": 0.11748175909555085, "learning_rate": 1.0566895275024458e-06, "loss": 0.0283, "num_tokens": 174489815.0, "step": 2289 }, { "epoch": 2.85589519650655, "grad_norm": 0.11978284345118138, "learning_rate": 1.0557347393668966e-06, "loss": 0.029, "num_tokens": 174564919.0, "step": 2290 }, { "epoch": 2.857142857142857, "grad_norm": 0.12034753501508996, "learning_rate": 1.0547880100275755e-06, "loss": 0.0289, "num_tokens": 174640337.0, "step": 2291 }, { "epoch": 2.858390517779164, "grad_norm": 0.11619971380015172, "learning_rate": 1.0538493412011901e-06, "loss": 0.0275, "num_tokens": 174716258.0, "step": 2292 }, { "epoch": 2.859638178415471, "grad_norm": 0.11708515456949027, "learning_rate": 1.0529187345898304e-06, "loss": 0.0282, "num_tokens": 174792952.0, "step": 2293 }, { "epoch": 2.860885839051778, "grad_norm": 0.1167407498714762, "learning_rate": 1.0519961918809675e-06, "loss": 0.0284, "num_tokens": 174869657.0, "step": 2294 }, { "epoch": 2.8621334996880847, "grad_norm": 0.11555197503669004, "learning_rate": 1.05108171474745e-06, "loss": 0.0279, "num_tokens": 174944457.0, "step": 2295 }, { "epoch": 2.8633811603243915, "grad_norm": 0.12153004275913126, "learning_rate": 1.050175304847502e-06, "loss": 0.0285, "num_tokens": 175021106.0, "step": 2296 }, { "epoch": 2.8646288209606987, "grad_norm": 0.11834069413546869, "learning_rate": 1.0492769638247177e-06, "loss": 0.0287, "num_tokens": 175097270.0, "step": 2297 }, { "epoch": 2.8658764815970055, "grad_norm": 0.11656588674507007, "learning_rate": 1.0483866933080611e-06, "loss": 0.0279, "num_tokens": 175172911.0, "step": 2298 }, { "epoch": 2.8671241422333127, "grad_norm": 0.1085038466941198, "learning_rate": 1.0475044949118624e-06, "loss": 0.0282, "num_tokens": 175248640.0, "step": 2299 }, { "epoch": 2.8683718028696195, "grad_norm": 0.12037669922139323, "learning_rate": 1.0466303702358139e-06, "loss": 0.0296, "num_tokens": 175325430.0, "step": 2300 }, { "epoch": 2.8696194635059262, "grad_norm": 0.13236897091504393, "learning_rate": 1.0457643208649665e-06, "loss": 0.0284, "num_tokens": 175401029.0, "step": 2301 }, { "epoch": 2.8708671241422334, "grad_norm": 0.11858987566645689, "learning_rate": 1.044906348369731e-06, "loss": 0.0294, "num_tokens": 175476957.0, "step": 2302 }, { "epoch": 2.87211478477854, "grad_norm": 0.1227951804625425, "learning_rate": 1.0440564543058703e-06, "loss": 0.0288, "num_tokens": 175552076.0, "step": 2303 }, { "epoch": 2.8733624454148474, "grad_norm": 0.12523536720297093, "learning_rate": 1.0432146402144986e-06, "loss": 0.0301, "num_tokens": 175628963.0, "step": 2304 }, { "epoch": 2.874610106051154, "grad_norm": 0.11875115342024792, "learning_rate": 1.0423809076220805e-06, "loss": 0.0284, "num_tokens": 175704868.0, "step": 2305 }, { "epoch": 2.875857766687461, "grad_norm": 0.11671628776248034, "learning_rate": 1.041555258040425e-06, "loss": 0.0287, "num_tokens": 175780283.0, "step": 2306 }, { "epoch": 2.8771054273237677, "grad_norm": 0.12368134811625374, "learning_rate": 1.0407376929666833e-06, "loss": 0.0284, "num_tokens": 175856347.0, "step": 2307 }, { "epoch": 2.878353087960075, "grad_norm": 0.1162472062590892, "learning_rate": 1.0399282138833488e-06, "loss": 0.0288, "num_tokens": 175932649.0, "step": 2308 }, { "epoch": 2.8796007485963817, "grad_norm": 0.11556916437765882, "learning_rate": 1.039126822258252e-06, "loss": 0.0277, "num_tokens": 176007896.0, "step": 2309 }, { "epoch": 2.880848409232689, "grad_norm": 0.1119609539627241, "learning_rate": 1.0383335195445573e-06, "loss": 0.0287, "num_tokens": 176083475.0, "step": 2310 }, { "epoch": 2.8820960698689957, "grad_norm": 0.1387995361889247, "learning_rate": 1.0375483071807626e-06, "loss": 0.0304, "num_tokens": 176161678.0, "step": 2311 }, { "epoch": 2.8833437305053025, "grad_norm": 0.12502477938746268, "learning_rate": 1.036771186590696e-06, "loss": 0.0283, "num_tokens": 176238838.0, "step": 2312 }, { "epoch": 2.8845913911416092, "grad_norm": 0.1240241231270145, "learning_rate": 1.0360021591835108e-06, "loss": 0.0289, "num_tokens": 176315044.0, "step": 2313 }, { "epoch": 2.8858390517779164, "grad_norm": 0.11504412374999518, "learning_rate": 1.0352412263536868e-06, "loss": 0.0282, "num_tokens": 176392990.0, "step": 2314 }, { "epoch": 2.887086712414223, "grad_norm": 0.1218440945357655, "learning_rate": 1.0344883894810257e-06, "loss": 0.0284, "num_tokens": 176468026.0, "step": 2315 }, { "epoch": 2.8883343730505304, "grad_norm": 0.11561022011101908, "learning_rate": 1.033743649930647e-06, "loss": 0.0279, "num_tokens": 176543576.0, "step": 2316 }, { "epoch": 2.889582033686837, "grad_norm": 0.11506182188310878, "learning_rate": 1.03300700905299e-06, "loss": 0.0285, "num_tokens": 176618675.0, "step": 2317 }, { "epoch": 2.890829694323144, "grad_norm": 0.10953695860741473, "learning_rate": 1.0322784681838062e-06, "loss": 0.0278, "num_tokens": 176695934.0, "step": 2318 }, { "epoch": 2.892077354959451, "grad_norm": 0.11379689055717356, "learning_rate": 1.0315580286441616e-06, "loss": 0.0279, "num_tokens": 176773662.0, "step": 2319 }, { "epoch": 2.893325015595758, "grad_norm": 0.12620578142759756, "learning_rate": 1.0308456917404294e-06, "loss": 0.0287, "num_tokens": 176850476.0, "step": 2320 }, { "epoch": 2.8945726762320647, "grad_norm": 0.12447892729114969, "learning_rate": 1.0301414587642926e-06, "loss": 0.0287, "num_tokens": 176927212.0, "step": 2321 }, { "epoch": 2.895820336868372, "grad_norm": 0.11779830546114492, "learning_rate": 1.029445330992738e-06, "loss": 0.0287, "num_tokens": 177003895.0, "step": 2322 }, { "epoch": 2.8970679975046787, "grad_norm": 0.11577768724484899, "learning_rate": 1.0287573096880566e-06, "loss": 0.0283, "num_tokens": 177079543.0, "step": 2323 }, { "epoch": 2.8983156581409855, "grad_norm": 0.12451751141007653, "learning_rate": 1.028077396097838e-06, "loss": 0.0294, "num_tokens": 177157185.0, "step": 2324 }, { "epoch": 2.8995633187772927, "grad_norm": 0.11716404656090428, "learning_rate": 1.0274055914549708e-06, "loss": 0.0291, "num_tokens": 177233249.0, "step": 2325 }, { "epoch": 2.9008109794135994, "grad_norm": 0.11057577294351982, "learning_rate": 1.0267418969776405e-06, "loss": 0.0269, "num_tokens": 177308957.0, "step": 2326 }, { "epoch": 2.9020586400499067, "grad_norm": 0.1264787802133383, "learning_rate": 1.0260863138693264e-06, "loss": 0.0293, "num_tokens": 177385062.0, "step": 2327 }, { "epoch": 2.9033063006862134, "grad_norm": 0.11358632489611671, "learning_rate": 1.0254388433187975e-06, "loss": 0.0282, "num_tokens": 177460873.0, "step": 2328 }, { "epoch": 2.90455396132252, "grad_norm": 0.11835170817675868, "learning_rate": 1.0247994865001147e-06, "loss": 0.0285, "num_tokens": 177535848.0, "step": 2329 }, { "epoch": 2.905801621958827, "grad_norm": 0.1116111761311637, "learning_rate": 1.0241682445726246e-06, "loss": 0.0283, "num_tokens": 177610552.0, "step": 2330 }, { "epoch": 2.907049282595134, "grad_norm": 0.11670378910446504, "learning_rate": 1.0235451186809596e-06, "loss": 0.0278, "num_tokens": 177685467.0, "step": 2331 }, { "epoch": 2.908296943231441, "grad_norm": 0.1148751254783169, "learning_rate": 1.0229301099550352e-06, "loss": 0.0279, "num_tokens": 177761525.0, "step": 2332 }, { "epoch": 2.909544603867748, "grad_norm": 0.11532511319179199, "learning_rate": 1.0223232195100485e-06, "loss": 0.0281, "num_tokens": 177837853.0, "step": 2333 }, { "epoch": 2.910792264504055, "grad_norm": 0.12020968010672094, "learning_rate": 1.0217244484464758e-06, "loss": 0.0278, "num_tokens": 177915043.0, "step": 2334 }, { "epoch": 2.9120399251403617, "grad_norm": 0.11806265091105844, "learning_rate": 1.0211337978500687e-06, "loss": 0.0286, "num_tokens": 177990872.0, "step": 2335 }, { "epoch": 2.913287585776669, "grad_norm": 0.11329008165553046, "learning_rate": 1.0205512687918558e-06, "loss": 0.029, "num_tokens": 178066454.0, "step": 2336 }, { "epoch": 2.9145352464129757, "grad_norm": 0.12620894909003305, "learning_rate": 1.0199768623281388e-06, "loss": 0.0298, "num_tokens": 178142963.0, "step": 2337 }, { "epoch": 2.9157829070492824, "grad_norm": 0.12495079283176895, "learning_rate": 1.0194105795004896e-06, "loss": 0.0288, "num_tokens": 178217723.0, "step": 2338 }, { "epoch": 2.9170305676855897, "grad_norm": 0.11862596150519701, "learning_rate": 1.0188524213357507e-06, "loss": 0.0282, "num_tokens": 178293553.0, "step": 2339 }, { "epoch": 2.9182782283218964, "grad_norm": 0.1341753117881708, "learning_rate": 1.0183023888460312e-06, "loss": 0.0295, "num_tokens": 178372300.0, "step": 2340 }, { "epoch": 2.919525888958203, "grad_norm": 0.12185364553046377, "learning_rate": 1.017760483028706e-06, "loss": 0.0284, "num_tokens": 178448395.0, "step": 2341 }, { "epoch": 2.9207735495945104, "grad_norm": 0.11320330905305766, "learning_rate": 1.017226704866415e-06, "loss": 0.0283, "num_tokens": 178524884.0, "step": 2342 }, { "epoch": 2.922021210230817, "grad_norm": 0.11032447769007697, "learning_rate": 1.0167010553270588e-06, "loss": 0.0273, "num_tokens": 178601402.0, "step": 2343 }, { "epoch": 2.9232688708671244, "grad_norm": 0.11664488161577292, "learning_rate": 1.016183535363799e-06, "loss": 0.0278, "num_tokens": 178677354.0, "step": 2344 }, { "epoch": 2.924516531503431, "grad_norm": 0.12448061115121994, "learning_rate": 1.0156741459150556e-06, "loss": 0.0288, "num_tokens": 178752504.0, "step": 2345 }, { "epoch": 2.925764192139738, "grad_norm": 0.12087959836076952, "learning_rate": 1.0151728879045057e-06, "loss": 0.0282, "num_tokens": 178828590.0, "step": 2346 }, { "epoch": 2.9270118527760447, "grad_norm": 0.12213029669986336, "learning_rate": 1.0146797622410813e-06, "loss": 0.028, "num_tokens": 178905502.0, "step": 2347 }, { "epoch": 2.928259513412352, "grad_norm": 0.11036904360530286, "learning_rate": 1.0141947698189684e-06, "loss": 0.0283, "num_tokens": 178980958.0, "step": 2348 }, { "epoch": 2.9295071740486587, "grad_norm": 0.11503129305804824, "learning_rate": 1.0137179115176055e-06, "loss": 0.0279, "num_tokens": 179056800.0, "step": 2349 }, { "epoch": 2.930754834684966, "grad_norm": 0.11773797518401417, "learning_rate": 1.0132491882016805e-06, "loss": 0.0284, "num_tokens": 179132364.0, "step": 2350 }, { "epoch": 2.9320024953212727, "grad_norm": 0.11841160723334339, "learning_rate": 1.0127886007211298e-06, "loss": 0.0301, "num_tokens": 179208373.0, "step": 2351 }, { "epoch": 2.9332501559575794, "grad_norm": 0.11954849849392611, "learning_rate": 1.0123361499111383e-06, "loss": 0.0292, "num_tokens": 179285645.0, "step": 2352 }, { "epoch": 2.934497816593886, "grad_norm": 0.11647024615398077, "learning_rate": 1.011891836592136e-06, "loss": 0.0282, "num_tokens": 179361853.0, "step": 2353 }, { "epoch": 2.9357454772301934, "grad_norm": 0.11857643419166924, "learning_rate": 1.0114556615697971e-06, "loss": 0.0289, "num_tokens": 179436883.0, "step": 2354 }, { "epoch": 2.9369931378665, "grad_norm": 0.12316371816786122, "learning_rate": 1.0110276256350393e-06, "loss": 0.0293, "num_tokens": 179513523.0, "step": 2355 }, { "epoch": 2.9382407985028074, "grad_norm": 0.11002325656322508, "learning_rate": 1.010607729564021e-06, "loss": 0.0271, "num_tokens": 179588664.0, "step": 2356 }, { "epoch": 2.939488459139114, "grad_norm": 0.13014534677930756, "learning_rate": 1.0101959741181396e-06, "loss": 0.0296, "num_tokens": 179665065.0, "step": 2357 }, { "epoch": 2.940736119775421, "grad_norm": 0.12006219056776087, "learning_rate": 1.0097923600440335e-06, "loss": 0.0287, "num_tokens": 179740516.0, "step": 2358 }, { "epoch": 2.941983780411728, "grad_norm": 0.11782985541856984, "learning_rate": 1.0093968880735762e-06, "loss": 0.0287, "num_tokens": 179816567.0, "step": 2359 }, { "epoch": 2.943231441048035, "grad_norm": 0.12636100120057928, "learning_rate": 1.009009558923878e-06, "loss": 0.028, "num_tokens": 179892466.0, "step": 2360 }, { "epoch": 2.944479101684342, "grad_norm": 0.12246295728147075, "learning_rate": 1.0086303732972843e-06, "loss": 0.029, "num_tokens": 179969033.0, "step": 2361 }, { "epoch": 2.945726762320649, "grad_norm": 0.12183181729491675, "learning_rate": 1.0082593318813728e-06, "loss": 0.0292, "num_tokens": 180044689.0, "step": 2362 }, { "epoch": 2.9469744229569557, "grad_norm": 0.11666644572098828, "learning_rate": 1.0078964353489536e-06, "loss": 0.0296, "num_tokens": 180121893.0, "step": 2363 }, { "epoch": 2.9482220835932624, "grad_norm": 0.12254191085037336, "learning_rate": 1.0075416843580687e-06, "loss": 0.0283, "num_tokens": 180197860.0, "step": 2364 }, { "epoch": 2.9494697442295696, "grad_norm": 0.12166198014054956, "learning_rate": 1.0071950795519873e-06, "loss": 0.0288, "num_tokens": 180273997.0, "step": 2365 }, { "epoch": 2.9507174048658764, "grad_norm": 0.12047516412080388, "learning_rate": 1.00685662155921e-06, "loss": 0.0281, "num_tokens": 180349865.0, "step": 2366 }, { "epoch": 2.9519650655021836, "grad_norm": 0.12922197730204377, "learning_rate": 1.0065263109934633e-06, "loss": 0.028, "num_tokens": 180425678.0, "step": 2367 }, { "epoch": 2.9532127261384904, "grad_norm": 0.11396094054724218, "learning_rate": 1.0062041484536994e-06, "loss": 0.0283, "num_tokens": 180501674.0, "step": 2368 }, { "epoch": 2.954460386774797, "grad_norm": 0.12194886207173511, "learning_rate": 1.0058901345240967e-06, "loss": 0.028, "num_tokens": 180578582.0, "step": 2369 }, { "epoch": 2.955708047411104, "grad_norm": 0.11173158238574944, "learning_rate": 1.0055842697740576e-06, "loss": 0.0274, "num_tokens": 180652999.0, "step": 2370 }, { "epoch": 2.956955708047411, "grad_norm": 0.11706697991931227, "learning_rate": 1.0052865547582074e-06, "loss": 0.0293, "num_tokens": 180728953.0, "step": 2371 }, { "epoch": 2.958203368683718, "grad_norm": 0.1255232811981919, "learning_rate": 1.004996990016393e-06, "loss": 0.0292, "num_tokens": 180805626.0, "step": 2372 }, { "epoch": 2.959451029320025, "grad_norm": 0.11392151489402788, "learning_rate": 1.0047155760736828e-06, "loss": 0.0277, "num_tokens": 180881984.0, "step": 2373 }, { "epoch": 2.960698689956332, "grad_norm": 0.13404422054020804, "learning_rate": 1.004442313440366e-06, "loss": 0.03, "num_tokens": 180957570.0, "step": 2374 }, { "epoch": 2.9619463505926387, "grad_norm": 0.11808665816651781, "learning_rate": 1.0041772026119493e-06, "loss": 0.0279, "num_tokens": 181033179.0, "step": 2375 }, { "epoch": 2.963194011228946, "grad_norm": 0.1241735141067898, "learning_rate": 1.0039202440691598e-06, "loss": 0.0308, "num_tokens": 181110216.0, "step": 2376 }, { "epoch": 2.9644416718652526, "grad_norm": 0.11937831230038913, "learning_rate": 1.0036714382779405e-06, "loss": 0.0291, "num_tokens": 181187152.0, "step": 2377 }, { "epoch": 2.9656893325015594, "grad_norm": 0.11692757515863732, "learning_rate": 1.0034307856894511e-06, "loss": 0.0291, "num_tokens": 181263915.0, "step": 2378 }, { "epoch": 2.9669369931378666, "grad_norm": 0.12227927398147019, "learning_rate": 1.0031982867400683e-06, "loss": 0.029, "num_tokens": 181340275.0, "step": 2379 }, { "epoch": 2.9681846537741734, "grad_norm": 0.11978900526732714, "learning_rate": 1.0029739418513825e-06, "loss": 0.0291, "num_tokens": 181415540.0, "step": 2380 }, { "epoch": 2.96943231441048, "grad_norm": 0.13883180958152286, "learning_rate": 1.0027577514301988e-06, "loss": 0.0291, "num_tokens": 181491477.0, "step": 2381 }, { "epoch": 2.9706799750467874, "grad_norm": 0.12341835004033076, "learning_rate": 1.002549715868536e-06, "loss": 0.0286, "num_tokens": 181566773.0, "step": 2382 }, { "epoch": 2.971927635683094, "grad_norm": 0.11368537266706243, "learning_rate": 1.0023498355436255e-06, "loss": 0.028, "num_tokens": 181642670.0, "step": 2383 }, { "epoch": 2.9731752963194014, "grad_norm": 0.11944682523487593, "learning_rate": 1.0021581108179105e-06, "loss": 0.0294, "num_tokens": 181719527.0, "step": 2384 }, { "epoch": 2.974422956955708, "grad_norm": 0.12346774894332574, "learning_rate": 1.0019745420390455e-06, "loss": 0.0289, "num_tokens": 181796043.0, "step": 2385 }, { "epoch": 2.975670617592015, "grad_norm": 0.12541437580408807, "learning_rate": 1.001799129539897e-06, "loss": 0.029, "num_tokens": 181872950.0, "step": 2386 }, { "epoch": 2.9769182782283217, "grad_norm": 0.12878850331430744, "learning_rate": 1.0016318736385406e-06, "loss": 0.0293, "num_tokens": 181950529.0, "step": 2387 }, { "epoch": 2.978165938864629, "grad_norm": 0.12597349481901815, "learning_rate": 1.0014727746382615e-06, "loss": 0.0303, "num_tokens": 182026703.0, "step": 2388 }, { "epoch": 2.9794135995009356, "grad_norm": 0.12172216490214947, "learning_rate": 1.0013218328275544e-06, "loss": 0.0298, "num_tokens": 182103106.0, "step": 2389 }, { "epoch": 2.980661260137243, "grad_norm": 0.11138447238599976, "learning_rate": 1.0011790484801231e-06, "loss": 0.0279, "num_tokens": 182178585.0, "step": 2390 }, { "epoch": 2.9819089207735496, "grad_norm": 0.10906966830793707, "learning_rate": 1.0010444218548777e-06, "loss": 0.0275, "num_tokens": 182253254.0, "step": 2391 }, { "epoch": 2.9831565814098564, "grad_norm": 0.12012535808427474, "learning_rate": 1.0009179531959374e-06, "loss": 0.0281, "num_tokens": 182329628.0, "step": 2392 }, { "epoch": 2.984404242046163, "grad_norm": 0.11447485418209809, "learning_rate": 1.0007996427326282e-06, "loss": 0.0283, "num_tokens": 182404805.0, "step": 2393 }, { "epoch": 2.9856519026824704, "grad_norm": 0.1180082136584759, "learning_rate": 1.0006894906794828e-06, "loss": 0.0282, "num_tokens": 182480786.0, "step": 2394 }, { "epoch": 2.986899563318777, "grad_norm": 0.11790449657258598, "learning_rate": 1.0005874972362403e-06, "loss": 0.0287, "num_tokens": 182557495.0, "step": 2395 }, { "epoch": 2.9881472239550844, "grad_norm": 0.11528577093578535, "learning_rate": 1.000493662587845e-06, "loss": 0.0279, "num_tokens": 182633656.0, "step": 2396 }, { "epoch": 2.989394884591391, "grad_norm": 0.12016212997807887, "learning_rate": 1.0004079869044482e-06, "loss": 0.0286, "num_tokens": 182709398.0, "step": 2397 }, { "epoch": 2.990642545227698, "grad_norm": 0.11852576824631264, "learning_rate": 1.0003304703414053e-06, "loss": 0.0302, "num_tokens": 182785596.0, "step": 2398 }, { "epoch": 2.991890205864005, "grad_norm": 0.12423687799838075, "learning_rate": 1.0002611130392772e-06, "loss": 0.0296, "num_tokens": 182863057.0, "step": 2399 }, { "epoch": 2.993137866500312, "grad_norm": 0.13300645679134993, "learning_rate": 1.0001999151238303e-06, "loss": 0.0283, "num_tokens": 182939191.0, "step": 2400 }, { "epoch": 2.994385527136619, "grad_norm": 0.12445363603171215, "learning_rate": 1.0001468767060341e-06, "loss": 0.0297, "num_tokens": 183014674.0, "step": 2401 }, { "epoch": 2.995633187772926, "grad_norm": 0.12331291687142039, "learning_rate": 1.000101997882064e-06, "loss": 0.0279, "num_tokens": 183090765.0, "step": 2402 }, { "epoch": 2.9968808484092326, "grad_norm": 0.11985250982047893, "learning_rate": 1.0000652787332984e-06, "loss": 0.0295, "num_tokens": 183167359.0, "step": 2403 }, { "epoch": 2.9981285090455394, "grad_norm": 0.13275772055558846, "learning_rate": 1.0000367193263206e-06, "loss": 0.0294, "num_tokens": 183243243.0, "step": 2404 }, { "epoch": 2.9993761696818466, "grad_norm": 0.23449268816415472, "learning_rate": 1.000016319712917e-06, "loss": 0.0481, "num_tokens": 183319605.0, "step": 2405 }, { "epoch": 3.0, "grad_norm": 0.23449268816415472, "learning_rate": 1.0000040799300788e-06, "loss": 0.0262, "num_tokens": 183357177.0, "step": 2406 }, { "epoch": 3.0, "step": 2406, "total_flos": 3.555842322968822e+17, "train_loss": 0.05951405684243042, "train_runtime": 5065.1458, "train_samples_per_second": 60.733, "train_steps_per_second": 0.475 } ], "logging_steps": 1, "max_steps": 2406, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.555842322968822e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }