{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 171, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.017543859649122806, "grad_norm": 3.9338779838849725, "learning_rate": 0.0, "loss": 1.2337, "num_tokens": 415561.0, "step": 1 }, { "epoch": 0.03508771929824561, "grad_norm": 4.061961880981431, "learning_rate": 1.6666666666666667e-06, "loss": 1.2551, "num_tokens": 811930.0, "step": 2 }, { "epoch": 0.05263157894736842, "grad_norm": 4.126747331618448, "learning_rate": 3.3333333333333333e-06, "loss": 1.2772, "num_tokens": 1198366.0, "step": 3 }, { "epoch": 0.07017543859649122, "grad_norm": 3.410910565563502, "learning_rate": 5e-06, "loss": 1.1776, "num_tokens": 1608933.0, "step": 4 }, { "epoch": 0.08771929824561403, "grad_norm": 2.233718748318594, "learning_rate": 6.666666666666667e-06, "loss": 0.9942, "num_tokens": 2068099.0, "step": 5 }, { "epoch": 0.10526315789473684, "grad_norm": 1.470021030676499, "learning_rate": 8.333333333333334e-06, "loss": 0.8448, "num_tokens": 2506575.0, "step": 6 }, { "epoch": 0.12280701754385964, "grad_norm": 1.4565364805839704, "learning_rate": 1e-05, "loss": 0.8213, "num_tokens": 2932014.0, "step": 7 }, { "epoch": 0.14035087719298245, "grad_norm": 2.3516845691230217, "learning_rate": 9.999184354855868e-06, "loss": 0.6275, "num_tokens": 3370254.0, "step": 8 }, { "epoch": 0.15789473684210525, "grad_norm": 1.8135724347288096, "learning_rate": 9.996737715102133e-06, "loss": 0.5931, "num_tokens": 3782161.0, "step": 9 }, { "epoch": 0.17543859649122806, "grad_norm": 1.4106997469551075, "learning_rate": 9.99266096766761e-06, "loss": 0.4933, "num_tokens": 4211970.0, "step": 10 }, { "epoch": 0.19298245614035087, "grad_norm": 0.8131031118317348, "learning_rate": 9.98695559040975e-06, "loss": 0.4229, "num_tokens": 4613367.0, "step": 11 }, { "epoch": 0.21052631578947367, "grad_norm": 0.7764134903704204, "learning_rate": 9.979623651578881e-06, "loss": 0.3888, "num_tokens": 5012484.0, "step": 12 }, { "epoch": 0.22807017543859648, "grad_norm": 0.37508885186302826, "learning_rate": 9.970667809068476e-06, "loss": 0.3783, "num_tokens": 5420781.0, "step": 13 }, { "epoch": 0.24561403508771928, "grad_norm": 0.3209831897748014, "learning_rate": 9.960091309451626e-06, "loss": 0.3308, "num_tokens": 5823019.0, "step": 14 }, { "epoch": 0.2631578947368421, "grad_norm": 0.2796386507143645, "learning_rate": 9.947897986804131e-06, "loss": 0.3456, "num_tokens": 6241736.0, "step": 15 }, { "epoch": 0.2807017543859649, "grad_norm": 0.25191330470494944, "learning_rate": 9.93409226131462e-06, "loss": 0.3172, "num_tokens": 6680804.0, "step": 16 }, { "epoch": 0.2982456140350877, "grad_norm": 0.2415593584049499, "learning_rate": 9.91867913768218e-06, "loss": 0.3253, "num_tokens": 7107939.0, "step": 17 }, { "epoch": 0.3157894736842105, "grad_norm": 0.3003770654260172, "learning_rate": 9.901664203302126e-06, "loss": 0.3213, "num_tokens": 7524281.0, "step": 18 }, { "epoch": 0.3333333333333333, "grad_norm": 0.3539478460817712, "learning_rate": 9.883053626240503e-06, "loss": 0.2966, "num_tokens": 7939796.0, "step": 19 }, { "epoch": 0.3508771929824561, "grad_norm": 0.2280690675113219, "learning_rate": 9.862854152998112e-06, "loss": 0.3043, "num_tokens": 8366406.0, "step": 20 }, { "epoch": 0.3684210526315789, "grad_norm": 0.21683554455375648, "learning_rate": 9.841073106064852e-06, "loss": 0.3057, "num_tokens": 8772526.0, "step": 21 }, { "epoch": 0.38596491228070173, "grad_norm": 0.19203110899204243, "learning_rate": 9.81771838126524e-06, "loss": 0.2899, "num_tokens": 9193705.0, "step": 22 }, { "epoch": 0.40350877192982454, "grad_norm": 0.21290762042544037, "learning_rate": 9.792798444896107e-06, "loss": 0.3073, "num_tokens": 9641419.0, "step": 23 }, { "epoch": 0.42105263157894735, "grad_norm": 0.1917960836643891, "learning_rate": 9.766322330657499e-06, "loss": 0.2921, "num_tokens": 10041552.0, "step": 24 }, { "epoch": 0.43859649122807015, "grad_norm": 0.18577210434865676, "learning_rate": 9.738299636377863e-06, "loss": 0.28, "num_tokens": 10476244.0, "step": 25 }, { "epoch": 0.45614035087719296, "grad_norm": 0.18309063662126532, "learning_rate": 9.70874052053476e-06, "loss": 0.2785, "num_tokens": 10863935.0, "step": 26 }, { "epoch": 0.47368421052631576, "grad_norm": 0.18434080604136077, "learning_rate": 9.677655698572326e-06, "loss": 0.2661, "num_tokens": 11259192.0, "step": 27 }, { "epoch": 0.49122807017543857, "grad_norm": 0.16956692098933204, "learning_rate": 9.645056439016827e-06, "loss": 0.273, "num_tokens": 11708724.0, "step": 28 }, { "epoch": 0.5087719298245614, "grad_norm": 0.16434051233500432, "learning_rate": 9.610954559391704e-06, "loss": 0.2667, "num_tokens": 12137403.0, "step": 29 }, { "epoch": 0.5263157894736842, "grad_norm": 0.17669427582556774, "learning_rate": 9.57536242193364e-06, "loss": 0.2584, "num_tokens": 12543239.0, "step": 30 }, { "epoch": 0.543859649122807, "grad_norm": 0.1652310917436867, "learning_rate": 9.538292929111114e-06, "loss": 0.2569, "num_tokens": 12940013.0, "step": 31 }, { "epoch": 0.5614035087719298, "grad_norm": 0.15646296852545927, "learning_rate": 9.499759518947156e-06, "loss": 0.2607, "num_tokens": 13409215.0, "step": 32 }, { "epoch": 0.5789473684210527, "grad_norm": 0.16249861759922887, "learning_rate": 9.459776160147941e-06, "loss": 0.2461, "num_tokens": 13806115.0, "step": 33 }, { "epoch": 0.5964912280701754, "grad_norm": 0.15613005334949157, "learning_rate": 9.418357347038999e-06, "loss": 0.2493, "num_tokens": 14248072.0, "step": 34 }, { "epoch": 0.6140350877192983, "grad_norm": 0.16063917924130028, "learning_rate": 9.375518094310904e-06, "loss": 0.255, "num_tokens": 14680852.0, "step": 35 }, { "epoch": 0.631578947368421, "grad_norm": 0.15840497254274827, "learning_rate": 9.331273931576306e-06, "loss": 0.2459, "num_tokens": 15109781.0, "step": 36 }, { "epoch": 0.6491228070175439, "grad_norm": 0.1549313386657812, "learning_rate": 9.285640897740316e-06, "loss": 0.2461, "num_tokens": 15556489.0, "step": 37 }, { "epoch": 0.6666666666666666, "grad_norm": 0.15967707273841447, "learning_rate": 9.238635535186247e-06, "loss": 0.2358, "num_tokens": 15975315.0, "step": 38 }, { "epoch": 0.6842105263157895, "grad_norm": 0.1664153879515682, "learning_rate": 9.19027488377886e-06, "loss": 0.2453, "num_tokens": 16366263.0, "step": 39 }, { "epoch": 0.7017543859649122, "grad_norm": 0.16085937077964602, "learning_rate": 9.140576474687263e-06, "loss": 0.2339, "num_tokens": 16780696.0, "step": 40 }, { "epoch": 0.7192982456140351, "grad_norm": 0.1525506357194124, "learning_rate": 9.0895583240297e-06, "loss": 0.2454, "num_tokens": 17226365.0, "step": 41 }, { "epoch": 0.7368421052631579, "grad_norm": 0.15414442592489289, "learning_rate": 9.037238926342544e-06, "loss": 0.2315, "num_tokens": 17651462.0, "step": 42 }, { "epoch": 0.7543859649122807, "grad_norm": 0.160113618784668, "learning_rate": 8.983637247875872e-06, "loss": 0.24, "num_tokens": 18064676.0, "step": 43 }, { "epoch": 0.7719298245614035, "grad_norm": 0.1592020616116995, "learning_rate": 8.92877271971802e-06, "loss": 0.236, "num_tokens": 18474727.0, "step": 44 }, { "epoch": 0.7894736842105263, "grad_norm": 0.15245337800135356, "learning_rate": 8.872665230751644e-06, "loss": 0.2405, "num_tokens": 18902046.0, "step": 45 }, { "epoch": 0.8070175438596491, "grad_norm": 0.20723449258057428, "learning_rate": 8.815335120443822e-06, "loss": 0.224, "num_tokens": 19319711.0, "step": 46 }, { "epoch": 0.8245614035087719, "grad_norm": 0.1474862726874557, "learning_rate": 8.756803171472817e-06, "loss": 0.2412, "num_tokens": 19757131.0, "step": 47 }, { "epoch": 0.8421052631578947, "grad_norm": 0.15052042969864995, "learning_rate": 8.69709060219416e-06, "loss": 0.2327, "num_tokens": 20194869.0, "step": 48 }, { "epoch": 0.8596491228070176, "grad_norm": 0.15385476085770403, "learning_rate": 8.636219058948823e-06, "loss": 0.2266, "num_tokens": 20597224.0, "step": 49 }, { "epoch": 0.8771929824561403, "grad_norm": 0.15228109362462297, "learning_rate": 8.574210608216206e-06, "loss": 0.239, "num_tokens": 21031895.0, "step": 50 }, { "epoch": 0.8947368421052632, "grad_norm": 0.14813164351615135, "learning_rate": 8.511087728614863e-06, "loss": 0.2271, "num_tokens": 21445108.0, "step": 51 }, { "epoch": 0.9122807017543859, "grad_norm": 0.14876791381975332, "learning_rate": 8.446873302753783e-06, "loss": 0.2277, "num_tokens": 21886020.0, "step": 52 }, { "epoch": 0.9298245614035088, "grad_norm": 0.14949287901477407, "learning_rate": 8.381590608937251e-06, "loss": 0.2395, "num_tokens": 22300331.0, "step": 53 }, { "epoch": 0.9473684210526315, "grad_norm": 0.1413875737037552, "learning_rate": 8.315263312726248e-06, "loss": 0.2321, "num_tokens": 22731352.0, "step": 54 }, { "epoch": 0.9649122807017544, "grad_norm": 0.14923397563157811, "learning_rate": 8.247915458359473e-06, "loss": 0.2169, "num_tokens": 23159887.0, "step": 55 }, { "epoch": 0.9824561403508771, "grad_norm": 0.15363902325932008, "learning_rate": 8.179571460037096e-06, "loss": 0.2357, "num_tokens": 23618660.0, "step": 56 }, { "epoch": 1.0, "grad_norm": 0.14687085036662337, "learning_rate": 8.110256093070393e-06, "loss": 0.2334, "num_tokens": 24039673.0, "step": 57 }, { "epoch": 1.0175438596491229, "grad_norm": 0.1476041876990848, "learning_rate": 8.039994484900463e-06, "loss": 0.2011, "num_tokens": 24434236.0, "step": 58 }, { "epoch": 1.0350877192982457, "grad_norm": 0.1419449695352204, "learning_rate": 7.968812105989316e-06, "loss": 0.2114, "num_tokens": 24853504.0, "step": 59 }, { "epoch": 1.0526315789473684, "grad_norm": 0.15337353318491537, "learning_rate": 7.896734760586599e-06, "loss": 0.2052, "num_tokens": 25255898.0, "step": 60 }, { "epoch": 1.0701754385964912, "grad_norm": 0.14113455068915454, "learning_rate": 7.82378857737533e-06, "loss": 0.2098, "num_tokens": 25667644.0, "step": 61 }, { "epoch": 1.087719298245614, "grad_norm": 0.14898616979585055, "learning_rate": 7.75e-06, "loss": 0.2041, "num_tokens": 26083880.0, "step": 62 }, { "epoch": 1.1052631578947367, "grad_norm": 0.15200676955309492, "learning_rate": 7.675395777480538e-06, "loss": 0.1911, "num_tokens": 26465737.0, "step": 63 }, { "epoch": 1.1228070175438596, "grad_norm": 0.15217717203165435, "learning_rate": 7.600002954515532e-06, "loss": 0.2113, "num_tokens": 26905038.0, "step": 64 }, { "epoch": 1.1403508771929824, "grad_norm": 0.1392806691058708, "learning_rate": 7.523848861678297e-06, "loss": 0.1981, "num_tokens": 27327803.0, "step": 65 }, { "epoch": 1.1578947368421053, "grad_norm": 0.14823210297550762, "learning_rate": 7.446961105509289e-06, "loss": 0.199, "num_tokens": 27743615.0, "step": 66 }, { "epoch": 1.1754385964912282, "grad_norm": 0.15025342094871685, "learning_rate": 7.36936755850849e-06, "loss": 0.2035, "num_tokens": 28159990.0, "step": 67 }, { "epoch": 1.1929824561403508, "grad_norm": 0.1424694767142918, "learning_rate": 7.2910963490313815e-06, "loss": 0.1997, "num_tokens": 28556831.0, "step": 68 }, { "epoch": 1.2105263157894737, "grad_norm": 0.14322092181407153, "learning_rate": 7.212175851092154e-06, "loss": 0.1961, "num_tokens": 28995194.0, "step": 69 }, { "epoch": 1.2280701754385965, "grad_norm": 0.1526511754370368, "learning_rate": 7.132634674077884e-06, "loss": 0.1921, "num_tokens": 29410916.0, "step": 70 }, { "epoch": 1.2456140350877192, "grad_norm": 0.139735447301268, "learning_rate": 7.052501652377368e-06, "loss": 0.2063, "num_tokens": 29913120.0, "step": 71 }, { "epoch": 1.263157894736842, "grad_norm": 0.14344116097027, "learning_rate": 6.971805834928399e-06, "loss": 0.2027, "num_tokens": 30351179.0, "step": 72 }, { "epoch": 1.280701754385965, "grad_norm": 0.1426174151835526, "learning_rate": 6.890576474687264e-06, "loss": 0.2064, "num_tokens": 30791604.0, "step": 73 }, { "epoch": 1.2982456140350878, "grad_norm": 0.14994921794042249, "learning_rate": 6.808843018024296e-06, "loss": 0.2065, "num_tokens": 31261315.0, "step": 74 }, { "epoch": 1.3157894736842106, "grad_norm": 0.15092392748993957, "learning_rate": 6.726635094049291e-06, "loss": 0.1908, "num_tokens": 31659426.0, "step": 75 }, { "epoch": 1.3333333333333333, "grad_norm": 0.14611148416178402, "learning_rate": 6.643982503870693e-06, "loss": 0.2051, "num_tokens": 32057379.0, "step": 76 }, { "epoch": 1.3508771929824561, "grad_norm": 0.1427497694651295, "learning_rate": 6.560915209792424e-06, "loss": 0.1992, "num_tokens": 32476948.0, "step": 77 }, { "epoch": 1.368421052631579, "grad_norm": 0.14625754427445514, "learning_rate": 6.477463324452286e-06, "loss": 0.1966, "num_tokens": 32880627.0, "step": 78 }, { "epoch": 1.3859649122807016, "grad_norm": 0.14428604329114683, "learning_rate": 6.393657099905854e-06, "loss": 0.2, "num_tokens": 33298268.0, "step": 79 }, { "epoch": 1.4035087719298245, "grad_norm": 0.14115165460163148, "learning_rate": 6.309526916659843e-06, "loss": 0.1961, "num_tokens": 33735278.0, "step": 80 }, { "epoch": 1.4210526315789473, "grad_norm": 0.14007382911380342, "learning_rate": 6.225103272658889e-06, "loss": 0.199, "num_tokens": 34180174.0, "step": 81 }, { "epoch": 1.4385964912280702, "grad_norm": 0.1464893959125568, "learning_rate": 6.140416772229785e-06, "loss": 0.1996, "num_tokens": 34600325.0, "step": 82 }, { "epoch": 1.456140350877193, "grad_norm": 0.14708924135818197, "learning_rate": 6.0554981149871276e-06, "loss": 0.1978, "num_tokens": 35028867.0, "step": 83 }, { "epoch": 1.4736842105263157, "grad_norm": 0.15137327410210355, "learning_rate": 5.970378084704441e-06, "loss": 0.1971, "num_tokens": 35465932.0, "step": 84 }, { "epoch": 1.4912280701754386, "grad_norm": 0.14187565136169733, "learning_rate": 5.88508753815478e-06, "loss": 0.1922, "num_tokens": 35857083.0, "step": 85 }, { "epoch": 1.5087719298245614, "grad_norm": 0.14750567085422958, "learning_rate": 5.799657393924869e-06, "loss": 0.1886, "num_tokens": 36243816.0, "step": 86 }, { "epoch": 1.526315789473684, "grad_norm": 0.13584876516210329, "learning_rate": 5.714118621206843e-06, "loss": 0.1949, "num_tokens": 36682515.0, "step": 87 }, { "epoch": 1.543859649122807, "grad_norm": 0.13138592337104565, "learning_rate": 5.6285022285716325e-06, "loss": 0.1848, "num_tokens": 37108482.0, "step": 88 }, { "epoch": 1.5614035087719298, "grad_norm": 0.14022318239136972, "learning_rate": 5.542839252728096e-06, "loss": 0.1986, "num_tokens": 37546779.0, "step": 89 }, { "epoch": 1.5789473684210527, "grad_norm": 0.14886854943483496, "learning_rate": 5.457160747271906e-06, "loss": 0.2025, "num_tokens": 37965357.0, "step": 90 }, { "epoch": 1.5964912280701755, "grad_norm": 0.14038303393485826, "learning_rate": 5.371497771428368e-06, "loss": 0.1975, "num_tokens": 38406113.0, "step": 91 }, { "epoch": 1.6140350877192984, "grad_norm": 0.14458491708947513, "learning_rate": 5.2858813787931605e-06, "loss": 0.1872, "num_tokens": 38800413.0, "step": 92 }, { "epoch": 1.631578947368421, "grad_norm": 0.1480350806263145, "learning_rate": 5.2003426060751324e-06, "loss": 0.1968, "num_tokens": 39200782.0, "step": 93 }, { "epoch": 1.6491228070175439, "grad_norm": 0.13730868120439885, "learning_rate": 5.114912461845223e-06, "loss": 0.2098, "num_tokens": 39666940.0, "step": 94 }, { "epoch": 1.6666666666666665, "grad_norm": 0.13357583490843425, "learning_rate": 5.02962191529556e-06, "loss": 0.2005, "num_tokens": 40103554.0, "step": 95 }, { "epoch": 1.6842105263157894, "grad_norm": 0.13277577033293314, "learning_rate": 4.944501885012875e-06, "loss": 0.1894, "num_tokens": 40523764.0, "step": 96 }, { "epoch": 1.7017543859649122, "grad_norm": 0.1445531104714593, "learning_rate": 4.859583227770218e-06, "loss": 0.1966, "num_tokens": 40923093.0, "step": 97 }, { "epoch": 1.719298245614035, "grad_norm": 0.1346892995124459, "learning_rate": 4.774896727341113e-06, "loss": 0.2085, "num_tokens": 41388977.0, "step": 98 }, { "epoch": 1.736842105263158, "grad_norm": 0.13950660376280916, "learning_rate": 4.6904730833401575e-06, "loss": 0.1993, "num_tokens": 41804049.0, "step": 99 }, { "epoch": 1.7543859649122808, "grad_norm": 0.13419027046897863, "learning_rate": 4.606342900094147e-06, "loss": 0.197, "num_tokens": 42212384.0, "step": 100 }, { "epoch": 1.7719298245614035, "grad_norm": 0.13471704895191902, "learning_rate": 4.5225366755477165e-06, "loss": 0.1991, "num_tokens": 42626890.0, "step": 101 }, { "epoch": 1.7894736842105263, "grad_norm": 0.1370431857295371, "learning_rate": 4.439084790207577e-06, "loss": 0.181, "num_tokens": 43045185.0, "step": 102 }, { "epoch": 1.807017543859649, "grad_norm": 0.13585245130540036, "learning_rate": 4.35601749612931e-06, "loss": 0.1926, "num_tokens": 43469318.0, "step": 103 }, { "epoch": 1.8245614035087718, "grad_norm": 0.14177193142263728, "learning_rate": 4.273364905950711e-06, "loss": 0.1883, "num_tokens": 43860735.0, "step": 104 }, { "epoch": 1.8421052631578947, "grad_norm": 0.13934124481189936, "learning_rate": 4.191156981975704e-06, "loss": 0.186, "num_tokens": 44259177.0, "step": 105 }, { "epoch": 1.8596491228070176, "grad_norm": 0.13757581593440663, "learning_rate": 4.109423525312738e-06, "loss": 0.1813, "num_tokens": 44652826.0, "step": 106 }, { "epoch": 1.8771929824561404, "grad_norm": 0.12994085331633362, "learning_rate": 4.028194165071603e-06, "loss": 0.2007, "num_tokens": 45110456.0, "step": 107 }, { "epoch": 1.8947368421052633, "grad_norm": 0.1315821088883057, "learning_rate": 3.9474983476226335e-06, "loss": 0.1984, "num_tokens": 45561128.0, "step": 108 }, { "epoch": 1.912280701754386, "grad_norm": 0.13466926193354203, "learning_rate": 3.867365325922116e-06, "loss": 0.195, "num_tokens": 45999442.0, "step": 109 }, { "epoch": 1.9298245614035088, "grad_norm": 0.1407044279595099, "learning_rate": 3.7878241489078473e-06, "loss": 0.1957, "num_tokens": 46405471.0, "step": 110 }, { "epoch": 1.9473684210526314, "grad_norm": 0.13869286010098864, "learning_rate": 3.7089036509686216e-06, "loss": 0.2096, "num_tokens": 46837446.0, "step": 111 }, { "epoch": 1.9649122807017543, "grad_norm": 0.13744222256799704, "learning_rate": 3.630632441491512e-06, "loss": 0.1874, "num_tokens": 47238683.0, "step": 112 }, { "epoch": 1.9824561403508771, "grad_norm": 0.13584540109090137, "learning_rate": 3.5530388944907124e-06, "loss": 0.1944, "num_tokens": 47661310.0, "step": 113 }, { "epoch": 2.0, "grad_norm": 0.14078927259649465, "learning_rate": 3.476151138321705e-06, "loss": 0.1893, "num_tokens": 48079346.0, "step": 114 }, { "epoch": 2.017543859649123, "grad_norm": 0.14801647762626233, "learning_rate": 3.3999970454844688e-06, "loss": 0.1833, "num_tokens": 48481444.0, "step": 115 }, { "epoch": 2.0350877192982457, "grad_norm": 0.14207211024797958, "learning_rate": 3.3246042225194626e-06, "loss": 0.1972, "num_tokens": 48904379.0, "step": 116 }, { "epoch": 2.0526315789473686, "grad_norm": 0.13459996651406178, "learning_rate": 3.2500000000000015e-06, "loss": 0.1734, "num_tokens": 49314280.0, "step": 117 }, { "epoch": 2.0701754385964914, "grad_norm": 0.13420885166445595, "learning_rate": 3.176211422624672e-06, "loss": 0.1748, "num_tokens": 49720807.0, "step": 118 }, { "epoch": 2.087719298245614, "grad_norm": 0.1396854400821284, "learning_rate": 3.103265239413401e-06, "loss": 0.1781, "num_tokens": 50151950.0, "step": 119 }, { "epoch": 2.1052631578947367, "grad_norm": 0.14321980578287785, "learning_rate": 3.0311878940106864e-06, "loss": 0.182, "num_tokens": 50574817.0, "step": 120 }, { "epoch": 2.1228070175438596, "grad_norm": 0.1402669344348115, "learning_rate": 2.9600055150995397e-06, "loss": 0.1804, "num_tokens": 50991178.0, "step": 121 }, { "epoch": 2.1403508771929824, "grad_norm": 0.14268249286817555, "learning_rate": 2.889743906929609e-06, "loss": 0.1701, "num_tokens": 51370437.0, "step": 122 }, { "epoch": 2.1578947368421053, "grad_norm": 0.13769468594260603, "learning_rate": 2.820428539962905e-06, "loss": 0.1803, "num_tokens": 51807382.0, "step": 123 }, { "epoch": 2.175438596491228, "grad_norm": 0.1418977810735795, "learning_rate": 2.7520845416405285e-06, "loss": 0.1867, "num_tokens": 52214420.0, "step": 124 }, { "epoch": 2.192982456140351, "grad_norm": 0.1357482680971467, "learning_rate": 2.6847366872737535e-06, "loss": 0.1855, "num_tokens": 52648228.0, "step": 125 }, { "epoch": 2.2105263157894735, "grad_norm": 0.13758984714319883, "learning_rate": 2.618409391062751e-06, "loss": 0.1928, "num_tokens": 53085345.0, "step": 126 }, { "epoch": 2.2280701754385963, "grad_norm": 0.13835249455110674, "learning_rate": 2.5531266972462176e-06, "loss": 0.1753, "num_tokens": 53493140.0, "step": 127 }, { "epoch": 2.245614035087719, "grad_norm": 0.13785872746102823, "learning_rate": 2.4889122713851397e-06, "loss": 0.1734, "num_tokens": 53922832.0, "step": 128 }, { "epoch": 2.263157894736842, "grad_norm": 0.1369141853241696, "learning_rate": 2.425789391783796e-06, "loss": 0.1771, "num_tokens": 54319035.0, "step": 129 }, { "epoch": 2.280701754385965, "grad_norm": 0.13274261857238875, "learning_rate": 2.36378094105118e-06, "loss": 0.1759, "num_tokens": 54754161.0, "step": 130 }, { "epoch": 2.2982456140350878, "grad_norm": 0.13374187150373806, "learning_rate": 2.302909397805841e-06, "loss": 0.1757, "num_tokens": 55177972.0, "step": 131 }, { "epoch": 2.3157894736842106, "grad_norm": 0.14056010545232225, "learning_rate": 2.2431968285271843e-06, "loss": 0.1762, "num_tokens": 55567532.0, "step": 132 }, { "epoch": 2.3333333333333335, "grad_norm": 0.14440093167538778, "learning_rate": 2.1846648795561777e-06, "loss": 0.1789, "num_tokens": 55997257.0, "step": 133 }, { "epoch": 2.3508771929824563, "grad_norm": 0.13209921359488944, "learning_rate": 2.1273347692483574e-06, "loss": 0.1683, "num_tokens": 56435780.0, "step": 134 }, { "epoch": 2.3684210526315788, "grad_norm": 0.13075669290172312, "learning_rate": 2.071227280281982e-06, "loss": 0.1766, "num_tokens": 56880449.0, "step": 135 }, { "epoch": 2.3859649122807016, "grad_norm": 0.15520159951824955, "learning_rate": 2.016362752124129e-06, "loss": 0.1766, "num_tokens": 57298925.0, "step": 136 }, { "epoch": 2.4035087719298245, "grad_norm": 0.13674994090087955, "learning_rate": 1.9627610736574575e-06, "loss": 0.1689, "num_tokens": 57702294.0, "step": 137 }, { "epoch": 2.4210526315789473, "grad_norm": 0.13321588166669115, "learning_rate": 1.9104416759703017e-06, "loss": 0.1758, "num_tokens": 58154418.0, "step": 138 }, { "epoch": 2.43859649122807, "grad_norm": 0.13632315461262803, "learning_rate": 1.8594235253127373e-06, "loss": 0.1767, "num_tokens": 58579683.0, "step": 139 }, { "epoch": 2.456140350877193, "grad_norm": 0.1288153252214618, "learning_rate": 1.8097251162211405e-06, "loss": 0.1811, "num_tokens": 59051562.0, "step": 140 }, { "epoch": 2.473684210526316, "grad_norm": 0.12987465966750822, "learning_rate": 1.7613644648137543e-06, "loss": 0.1774, "num_tokens": 59501698.0, "step": 141 }, { "epoch": 2.4912280701754383, "grad_norm": 0.1374425426257984, "learning_rate": 1.7143591022596846e-06, "loss": 0.1944, "num_tokens": 59955592.0, "step": 142 }, { "epoch": 2.5087719298245617, "grad_norm": 0.13531966438989285, "learning_rate": 1.6687260684236943e-06, "loss": 0.1805, "num_tokens": 60381274.0, "step": 143 }, { "epoch": 2.526315789473684, "grad_norm": 0.134925096600037, "learning_rate": 1.6244819056890975e-06, "loss": 0.171, "num_tokens": 60788438.0, "step": 144 }, { "epoch": 2.543859649122807, "grad_norm": 0.1407719819848176, "learning_rate": 1.5816426529610035e-06, "loss": 0.1714, "num_tokens": 61184551.0, "step": 145 }, { "epoch": 2.56140350877193, "grad_norm": 0.1359063010745857, "learning_rate": 1.5402238398520614e-06, "loss": 0.1758, "num_tokens": 61619839.0, "step": 146 }, { "epoch": 2.5789473684210527, "grad_norm": 0.13246835060504009, "learning_rate": 1.5002404810528452e-06, "loss": 0.1739, "num_tokens": 62045889.0, "step": 147 }, { "epoch": 2.5964912280701755, "grad_norm": 0.13964950522945552, "learning_rate": 1.4617070708888882e-06, "loss": 0.1718, "num_tokens": 62437900.0, "step": 148 }, { "epoch": 2.6140350877192984, "grad_norm": 0.13372294299780066, "learning_rate": 1.4246375780663613e-06, "loss": 0.1661, "num_tokens": 62843119.0, "step": 149 }, { "epoch": 2.6315789473684212, "grad_norm": 0.13805400712254826, "learning_rate": 1.389045440608296e-06, "loss": 0.1828, "num_tokens": 63262284.0, "step": 150 }, { "epoch": 2.6491228070175437, "grad_norm": 0.13808339180910903, "learning_rate": 1.354943560983175e-06, "loss": 0.1803, "num_tokens": 63701914.0, "step": 151 }, { "epoch": 2.6666666666666665, "grad_norm": 0.13165183261262675, "learning_rate": 1.3223443014276738e-06, "loss": 0.1774, "num_tokens": 64149759.0, "step": 152 }, { "epoch": 2.6842105263157894, "grad_norm": 0.1319243140393186, "learning_rate": 1.2912594794652406e-06, "loss": 0.1799, "num_tokens": 64602514.0, "step": 153 }, { "epoch": 2.7017543859649122, "grad_norm": 0.13538494698388223, "learning_rate": 1.2617003636221394e-06, "loss": 0.1694, "num_tokens": 64992832.0, "step": 154 }, { "epoch": 2.719298245614035, "grad_norm": 0.13498616412912304, "learning_rate": 1.2336776693425028e-06, "loss": 0.1707, "num_tokens": 65389001.0, "step": 155 }, { "epoch": 2.736842105263158, "grad_norm": 0.12924445948277097, "learning_rate": 1.2072015551038933e-06, "loss": 0.1692, "num_tokens": 65842297.0, "step": 156 }, { "epoch": 2.754385964912281, "grad_norm": 0.13206490698518994, "learning_rate": 1.1822816187347625e-06, "loss": 0.1719, "num_tokens": 66254756.0, "step": 157 }, { "epoch": 2.7719298245614032, "grad_norm": 0.13946311479316087, "learning_rate": 1.1589268939351499e-06, "loss": 0.1824, "num_tokens": 66653349.0, "step": 158 }, { "epoch": 2.7894736842105265, "grad_norm": 0.12958606648427337, "learning_rate": 1.1371458470018896e-06, "loss": 0.1758, "num_tokens": 67089072.0, "step": 159 }, { "epoch": 2.807017543859649, "grad_norm": 0.12836201287391627, "learning_rate": 1.1169463737594995e-06, "loss": 0.1725, "num_tokens": 67530318.0, "step": 160 }, { "epoch": 2.824561403508772, "grad_norm": 0.1312877766550337, "learning_rate": 1.0983357966978747e-06, "loss": 0.17, "num_tokens": 67943670.0, "step": 161 }, { "epoch": 2.8421052631578947, "grad_norm": 0.1292736067143446, "learning_rate": 1.0813208623178199e-06, "loss": 0.1759, "num_tokens": 68380170.0, "step": 162 }, { "epoch": 2.8596491228070176, "grad_norm": 0.13227684289793154, "learning_rate": 1.0659077386853817e-06, "loss": 0.1719, "num_tokens": 68808114.0, "step": 163 }, { "epoch": 2.8771929824561404, "grad_norm": 0.13694616326909123, "learning_rate": 1.0521020131958692e-06, "loss": 0.168, "num_tokens": 69191512.0, "step": 164 }, { "epoch": 2.8947368421052633, "grad_norm": 0.135480809490847, "learning_rate": 1.0399086905483752e-06, "loss": 0.1659, "num_tokens": 69582710.0, "step": 165 }, { "epoch": 2.912280701754386, "grad_norm": 0.13656650923592858, "learning_rate": 1.0293321909315242e-06, "loss": 0.1764, "num_tokens": 69995390.0, "step": 166 }, { "epoch": 2.9298245614035086, "grad_norm": 0.13037307085567856, "learning_rate": 1.0203763484211196e-06, "loss": 0.1737, "num_tokens": 70418613.0, "step": 167 }, { "epoch": 2.9473684210526314, "grad_norm": 0.13386161162103719, "learning_rate": 1.0130444095902514e-06, "loss": 0.1731, "num_tokens": 70848952.0, "step": 168 }, { "epoch": 2.9649122807017543, "grad_norm": 0.13477603098195323, "learning_rate": 1.0073390323323897e-06, "loss": 0.1859, "num_tokens": 71284648.0, "step": 169 }, { "epoch": 2.982456140350877, "grad_norm": 0.13335006132129937, "learning_rate": 1.0032622848978689e-06, "loss": 0.1727, "num_tokens": 71712021.0, "step": 170 }, { "epoch": 3.0, "grad_norm": 0.13952787638079295, "learning_rate": 1.000815645144134e-06, "loss": 0.1803, "num_tokens": 72119019.0, "step": 171 }, { "epoch": 3.0, "step": 171, "total_flos": 2.3168881532705178e+17, "train_loss": 0.25350178002614027, "train_runtime": 2712.8956, "train_samples_per_second": 8.068, "train_steps_per_second": 0.063 } ], "logging_steps": 1, "max_steps": 171, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.3168881532705178e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }