{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 171,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.017543859649122806,
      "grad_norm": 3.9338779838849725,
      "learning_rate": 0.0,
      "loss": 1.2337,
      "num_tokens": 415561.0,
      "step": 1
    },
    {
      "epoch": 0.03508771929824561,
      "grad_norm": 4.061961880981431,
      "learning_rate": 1.6666666666666667e-06,
      "loss": 1.2551,
      "num_tokens": 811930.0,
      "step": 2
    },
    {
      "epoch": 0.05263157894736842,
      "grad_norm": 4.126747331618448,
      "learning_rate": 3.3333333333333333e-06,
      "loss": 1.2772,
      "num_tokens": 1198366.0,
      "step": 3
    },
    {
      "epoch": 0.07017543859649122,
      "grad_norm": 3.410910565563502,
      "learning_rate": 5e-06,
      "loss": 1.1776,
      "num_tokens": 1608933.0,
      "step": 4
    },
    {
      "epoch": 0.08771929824561403,
      "grad_norm": 2.233718748318594,
      "learning_rate": 6.666666666666667e-06,
      "loss": 0.9942,
      "num_tokens": 2068099.0,
      "step": 5
    },
    {
      "epoch": 0.10526315789473684,
      "grad_norm": 1.470021030676499,
      "learning_rate": 8.333333333333334e-06,
      "loss": 0.8448,
      "num_tokens": 2506575.0,
      "step": 6
    },
    {
      "epoch": 0.12280701754385964,
      "grad_norm": 1.4565364805839704,
      "learning_rate": 1e-05,
      "loss": 0.8213,
      "num_tokens": 2932014.0,
      "step": 7
    },
    {
      "epoch": 0.14035087719298245,
      "grad_norm": 2.3516845691230217,
      "learning_rate": 9.999184354855868e-06,
      "loss": 0.6275,
      "num_tokens": 3370254.0,
      "step": 8
    },
    {
      "epoch": 0.15789473684210525,
      "grad_norm": 1.8135724347288096,
      "learning_rate": 9.996737715102133e-06,
      "loss": 0.5931,
      "num_tokens": 3782161.0,
      "step": 9
    },
    {
      "epoch": 0.17543859649122806,
      "grad_norm": 1.4106997469551075,
      "learning_rate": 9.99266096766761e-06,
      "loss": 0.4933,
      "num_tokens": 4211970.0,
      "step": 10
    },
    {
      "epoch": 0.19298245614035087,
      "grad_norm": 0.8131031118317348,
      "learning_rate": 9.98695559040975e-06,
      "loss": 0.4229,
      "num_tokens": 4613367.0,
      "step": 11
    },
    {
      "epoch": 0.21052631578947367,
      "grad_norm": 0.7764134903704204,
      "learning_rate": 9.979623651578881e-06,
      "loss": 0.3888,
      "num_tokens": 5012484.0,
      "step": 12
    },
    {
      "epoch": 0.22807017543859648,
      "grad_norm": 0.37508885186302826,
      "learning_rate": 9.970667809068476e-06,
      "loss": 0.3783,
      "num_tokens": 5420781.0,
      "step": 13
    },
    {
      "epoch": 0.24561403508771928,
      "grad_norm": 0.3209831897748014,
      "learning_rate": 9.960091309451626e-06,
      "loss": 0.3308,
      "num_tokens": 5823019.0,
      "step": 14
    },
    {
      "epoch": 0.2631578947368421,
      "grad_norm": 0.2796386507143645,
      "learning_rate": 9.947897986804131e-06,
      "loss": 0.3456,
      "num_tokens": 6241736.0,
      "step": 15
    },
    {
      "epoch": 0.2807017543859649,
      "grad_norm": 0.25191330470494944,
      "learning_rate": 9.93409226131462e-06,
      "loss": 0.3172,
      "num_tokens": 6680804.0,
      "step": 16
    },
    {
      "epoch": 0.2982456140350877,
      "grad_norm": 0.2415593584049499,
      "learning_rate": 9.91867913768218e-06,
      "loss": 0.3253,
      "num_tokens": 7107939.0,
      "step": 17
    },
    {
      "epoch": 0.3157894736842105,
      "grad_norm": 0.3003770654260172,
      "learning_rate": 9.901664203302126e-06,
      "loss": 0.3213,
      "num_tokens": 7524281.0,
      "step": 18
    },
    {
      "epoch": 0.3333333333333333,
      "grad_norm": 0.3539478460817712,
      "learning_rate": 9.883053626240503e-06,
      "loss": 0.2966,
      "num_tokens": 7939796.0,
      "step": 19
    },
    {
      "epoch": 0.3508771929824561,
      "grad_norm": 0.2280690675113219,
      "learning_rate": 9.862854152998112e-06,
      "loss": 0.3043,
      "num_tokens": 8366406.0,
      "step": 20
    },
    {
      "epoch": 0.3684210526315789,
      "grad_norm": 0.21683554455375648,
      "learning_rate": 9.841073106064852e-06,
      "loss": 0.3057,
      "num_tokens": 8772526.0,
      "step": 21
    },
    {
      "epoch": 0.38596491228070173,
      "grad_norm": 0.19203110899204243,
      "learning_rate": 9.81771838126524e-06,
      "loss": 0.2899,
      "num_tokens": 9193705.0,
      "step": 22
    },
    {
      "epoch": 0.40350877192982454,
      "grad_norm": 0.21290762042544037,
      "learning_rate": 9.792798444896107e-06,
      "loss": 0.3073,
      "num_tokens": 9641419.0,
      "step": 23
    },
    {
      "epoch": 0.42105263157894735,
      "grad_norm": 0.1917960836643891,
      "learning_rate": 9.766322330657499e-06,
      "loss": 0.2921,
      "num_tokens": 10041552.0,
      "step": 24
    },
    {
      "epoch": 0.43859649122807015,
      "grad_norm": 0.18577210434865676,
      "learning_rate": 9.738299636377863e-06,
      "loss": 0.28,
      "num_tokens": 10476244.0,
      "step": 25
    },
    {
      "epoch": 0.45614035087719296,
      "grad_norm": 0.18309063662126532,
      "learning_rate": 9.70874052053476e-06,
      "loss": 0.2785,
      "num_tokens": 10863935.0,
      "step": 26
    },
    {
      "epoch": 0.47368421052631576,
      "grad_norm": 0.18434080604136077,
      "learning_rate": 9.677655698572326e-06,
      "loss": 0.2661,
      "num_tokens": 11259192.0,
      "step": 27
    },
    {
      "epoch": 0.49122807017543857,
      "grad_norm": 0.16956692098933204,
      "learning_rate": 9.645056439016827e-06,
      "loss": 0.273,
      "num_tokens": 11708724.0,
      "step": 28
    },
    {
      "epoch": 0.5087719298245614,
      "grad_norm": 0.16434051233500432,
      "learning_rate": 9.610954559391704e-06,
      "loss": 0.2667,
      "num_tokens": 12137403.0,
      "step": 29
    },
    {
      "epoch": 0.5263157894736842,
      "grad_norm": 0.17669427582556774,
      "learning_rate": 9.57536242193364e-06,
      "loss": 0.2584,
      "num_tokens": 12543239.0,
      "step": 30
    },
    {
      "epoch": 0.543859649122807,
      "grad_norm": 0.1652310917436867,
      "learning_rate": 9.538292929111114e-06,
      "loss": 0.2569,
      "num_tokens": 12940013.0,
      "step": 31
    },
    {
      "epoch": 0.5614035087719298,
      "grad_norm": 0.15646296852545927,
      "learning_rate": 9.499759518947156e-06,
      "loss": 0.2607,
      "num_tokens": 13409215.0,
      "step": 32
    },
    {
      "epoch": 0.5789473684210527,
      "grad_norm": 0.16249861759922887,
      "learning_rate": 9.459776160147941e-06,
      "loss": 0.2461,
      "num_tokens": 13806115.0,
      "step": 33
    },
    {
      "epoch": 0.5964912280701754,
      "grad_norm": 0.15613005334949157,
      "learning_rate": 9.418357347038999e-06,
      "loss": 0.2493,
      "num_tokens": 14248072.0,
      "step": 34
    },
    {
      "epoch": 0.6140350877192983,
      "grad_norm": 0.16063917924130028,
      "learning_rate": 9.375518094310904e-06,
      "loss": 0.255,
      "num_tokens": 14680852.0,
      "step": 35
    },
    {
      "epoch": 0.631578947368421,
      "grad_norm": 0.15840497254274827,
      "learning_rate": 9.331273931576306e-06,
      "loss": 0.2459,
      "num_tokens": 15109781.0,
      "step": 36
    },
    {
      "epoch": 0.6491228070175439,
      "grad_norm": 0.1549313386657812,
      "learning_rate": 9.285640897740316e-06,
      "loss": 0.2461,
      "num_tokens": 15556489.0,
      "step": 37
    },
    {
      "epoch": 0.6666666666666666,
      "grad_norm": 0.15967707273841447,
      "learning_rate": 9.238635535186247e-06,
      "loss": 0.2358,
      "num_tokens": 15975315.0,
      "step": 38
    },
    {
      "epoch": 0.6842105263157895,
      "grad_norm": 0.1664153879515682,
      "learning_rate": 9.19027488377886e-06,
      "loss": 0.2453,
      "num_tokens": 16366263.0,
      "step": 39
    },
    {
      "epoch": 0.7017543859649122,
      "grad_norm": 0.16085937077964602,
      "learning_rate": 9.140576474687263e-06,
      "loss": 0.2339,
      "num_tokens": 16780696.0,
      "step": 40
    },
    {
      "epoch": 0.7192982456140351,
      "grad_norm": 0.1525506357194124,
      "learning_rate": 9.0895583240297e-06,
      "loss": 0.2454,
      "num_tokens": 17226365.0,
      "step": 41
    },
    {
      "epoch": 0.7368421052631579,
      "grad_norm": 0.15414442592489289,
      "learning_rate": 9.037238926342544e-06,
      "loss": 0.2315,
      "num_tokens": 17651462.0,
      "step": 42
    },
    {
      "epoch": 0.7543859649122807,
      "grad_norm": 0.160113618784668,
      "learning_rate": 8.983637247875872e-06,
      "loss": 0.24,
      "num_tokens": 18064676.0,
      "step": 43
    },
    {
      "epoch": 0.7719298245614035,
      "grad_norm": 0.1592020616116995,
      "learning_rate": 8.92877271971802e-06,
      "loss": 0.236,
      "num_tokens": 18474727.0,
      "step": 44
    },
    {
      "epoch": 0.7894736842105263,
      "grad_norm": 0.15245337800135356,
      "learning_rate": 8.872665230751644e-06,
      "loss": 0.2405,
      "num_tokens": 18902046.0,
      "step": 45
    },
    {
      "epoch": 0.8070175438596491,
      "grad_norm": 0.20723449258057428,
      "learning_rate": 8.815335120443822e-06,
      "loss": 0.224,
      "num_tokens": 19319711.0,
      "step": 46
    },
    {
      "epoch": 0.8245614035087719,
      "grad_norm": 0.1474862726874557,
      "learning_rate": 8.756803171472817e-06,
      "loss": 0.2412,
      "num_tokens": 19757131.0,
      "step": 47
    },
    {
      "epoch": 0.8421052631578947,
      "grad_norm": 0.15052042969864995,
      "learning_rate": 8.69709060219416e-06,
      "loss": 0.2327,
      "num_tokens": 20194869.0,
      "step": 48
    },
    {
      "epoch": 0.8596491228070176,
      "grad_norm": 0.15385476085770403,
      "learning_rate": 8.636219058948823e-06,
      "loss": 0.2266,
      "num_tokens": 20597224.0,
      "step": 49
    },
    {
      "epoch": 0.8771929824561403,
      "grad_norm": 0.15228109362462297,
      "learning_rate": 8.574210608216206e-06,
      "loss": 0.239,
      "num_tokens": 21031895.0,
      "step": 50
    },
    {
      "epoch": 0.8947368421052632,
      "grad_norm": 0.14813164351615135,
      "learning_rate": 8.511087728614863e-06,
      "loss": 0.2271,
      "num_tokens": 21445108.0,
      "step": 51
    },
    {
      "epoch": 0.9122807017543859,
      "grad_norm": 0.14876791381975332,
      "learning_rate": 8.446873302753783e-06,
      "loss": 0.2277,
      "num_tokens": 21886020.0,
      "step": 52
    },
    {
      "epoch": 0.9298245614035088,
      "grad_norm": 0.14949287901477407,
      "learning_rate": 8.381590608937251e-06,
      "loss": 0.2395,
      "num_tokens": 22300331.0,
      "step": 53
    },
    {
      "epoch": 0.9473684210526315,
      "grad_norm": 0.1413875737037552,
      "learning_rate": 8.315263312726248e-06,
      "loss": 0.2321,
      "num_tokens": 22731352.0,
      "step": 54
    },
    {
      "epoch": 0.9649122807017544,
      "grad_norm": 0.14923397563157811,
      "learning_rate": 8.247915458359473e-06,
      "loss": 0.2169,
      "num_tokens": 23159887.0,
      "step": 55
    },
    {
      "epoch": 0.9824561403508771,
      "grad_norm": 0.15363902325932008,
      "learning_rate": 8.179571460037096e-06,
      "loss": 0.2357,
      "num_tokens": 23618660.0,
      "step": 56
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.14687085036662337,
      "learning_rate": 8.110256093070393e-06,
      "loss": 0.2334,
      "num_tokens": 24039673.0,
      "step": 57
    },
    {
      "epoch": 1.0175438596491229,
      "grad_norm": 0.1476041876990848,
      "learning_rate": 8.039994484900463e-06,
      "loss": 0.2011,
      "num_tokens": 24434236.0,
      "step": 58
    },
    {
      "epoch": 1.0350877192982457,
      "grad_norm": 0.1419449695352204,
      "learning_rate": 7.968812105989316e-06,
      "loss": 0.2114,
      "num_tokens": 24853504.0,
      "step": 59
    },
    {
      "epoch": 1.0526315789473684,
      "grad_norm": 0.15337353318491537,
      "learning_rate": 7.896734760586599e-06,
      "loss": 0.2052,
      "num_tokens": 25255898.0,
      "step": 60
    },
    {
      "epoch": 1.0701754385964912,
      "grad_norm": 0.14113455068915454,
      "learning_rate": 7.82378857737533e-06,
      "loss": 0.2098,
      "num_tokens": 25667644.0,
      "step": 61
    },
    {
      "epoch": 1.087719298245614,
      "grad_norm": 0.14898616979585055,
      "learning_rate": 7.75e-06,
      "loss": 0.2041,
      "num_tokens": 26083880.0,
      "step": 62
    },
    {
      "epoch": 1.1052631578947367,
      "grad_norm": 0.15200676955309492,
      "learning_rate": 7.675395777480538e-06,
      "loss": 0.1911,
      "num_tokens": 26465737.0,
      "step": 63
    },
    {
      "epoch": 1.1228070175438596,
      "grad_norm": 0.15217717203165435,
      "learning_rate": 7.600002954515532e-06,
      "loss": 0.2113,
      "num_tokens": 26905038.0,
      "step": 64
    },
    {
      "epoch": 1.1403508771929824,
      "grad_norm": 0.1392806691058708,
      "learning_rate": 7.523848861678297e-06,
      "loss": 0.1981,
      "num_tokens": 27327803.0,
      "step": 65
    },
    {
      "epoch": 1.1578947368421053,
      "grad_norm": 0.14823210297550762,
      "learning_rate": 7.446961105509289e-06,
      "loss": 0.199,
      "num_tokens": 27743615.0,
      "step": 66
    },
    {
      "epoch": 1.1754385964912282,
      "grad_norm": 0.15025342094871685,
      "learning_rate": 7.36936755850849e-06,
      "loss": 0.2035,
      "num_tokens": 28159990.0,
      "step": 67
    },
    {
      "epoch": 1.1929824561403508,
      "grad_norm": 0.1424694767142918,
      "learning_rate": 7.2910963490313815e-06,
      "loss": 0.1997,
      "num_tokens": 28556831.0,
      "step": 68
    },
    {
      "epoch": 1.2105263157894737,
      "grad_norm": 0.14322092181407153,
      "learning_rate": 7.212175851092154e-06,
      "loss": 0.1961,
      "num_tokens": 28995194.0,
      "step": 69
    },
    {
      "epoch": 1.2280701754385965,
      "grad_norm": 0.1526511754370368,
      "learning_rate": 7.132634674077884e-06,
      "loss": 0.1921,
      "num_tokens": 29410916.0,
      "step": 70
    },
    {
      "epoch": 1.2456140350877192,
      "grad_norm": 0.139735447301268,
      "learning_rate": 7.052501652377368e-06,
      "loss": 0.2063,
      "num_tokens": 29913120.0,
      "step": 71
    },
    {
      "epoch": 1.263157894736842,
      "grad_norm": 0.14344116097027,
      "learning_rate": 6.971805834928399e-06,
      "loss": 0.2027,
      "num_tokens": 30351179.0,
      "step": 72
    },
    {
      "epoch": 1.280701754385965,
      "grad_norm": 0.1426174151835526,
      "learning_rate": 6.890576474687264e-06,
      "loss": 0.2064,
      "num_tokens": 30791604.0,
      "step": 73
    },
    {
      "epoch": 1.2982456140350878,
      "grad_norm": 0.14994921794042249,
      "learning_rate": 6.808843018024296e-06,
      "loss": 0.2065,
      "num_tokens": 31261315.0,
      "step": 74
    },
    {
      "epoch": 1.3157894736842106,
      "grad_norm": 0.15092392748993957,
      "learning_rate": 6.726635094049291e-06,
      "loss": 0.1908,
      "num_tokens": 31659426.0,
      "step": 75
    },
    {
      "epoch": 1.3333333333333333,
      "grad_norm": 0.14611148416178402,
      "learning_rate": 6.643982503870693e-06,
      "loss": 0.2051,
      "num_tokens": 32057379.0,
      "step": 76
    },
    {
      "epoch": 1.3508771929824561,
      "grad_norm": 0.1427497694651295,
      "learning_rate": 6.560915209792424e-06,
      "loss": 0.1992,
      "num_tokens": 32476948.0,
      "step": 77
    },
    {
      "epoch": 1.368421052631579,
      "grad_norm": 0.14625754427445514,
      "learning_rate": 6.477463324452286e-06,
      "loss": 0.1966,
      "num_tokens": 32880627.0,
      "step": 78
    },
    {
      "epoch": 1.3859649122807016,
      "grad_norm": 0.14428604329114683,
      "learning_rate": 6.393657099905854e-06,
      "loss": 0.2,
      "num_tokens": 33298268.0,
      "step": 79
    },
    {
      "epoch": 1.4035087719298245,
      "grad_norm": 0.14115165460163148,
      "learning_rate": 6.309526916659843e-06,
      "loss": 0.1961,
      "num_tokens": 33735278.0,
      "step": 80
    },
    {
      "epoch": 1.4210526315789473,
      "grad_norm": 0.14007382911380342,
      "learning_rate": 6.225103272658889e-06,
      "loss": 0.199,
      "num_tokens": 34180174.0,
      "step": 81
    },
    {
      "epoch": 1.4385964912280702,
      "grad_norm": 0.1464893959125568,
      "learning_rate": 6.140416772229785e-06,
      "loss": 0.1996,
      "num_tokens": 34600325.0,
      "step": 82
    },
    {
      "epoch": 1.456140350877193,
      "grad_norm": 0.14708924135818197,
      "learning_rate": 6.0554981149871276e-06,
      "loss": 0.1978,
      "num_tokens": 35028867.0,
      "step": 83
    },
    {
      "epoch": 1.4736842105263157,
      "grad_norm": 0.15137327410210355,
      "learning_rate": 5.970378084704441e-06,
      "loss": 0.1971,
      "num_tokens": 35465932.0,
      "step": 84
    },
    {
      "epoch": 1.4912280701754386,
      "grad_norm": 0.14187565136169733,
      "learning_rate": 5.88508753815478e-06,
      "loss": 0.1922,
      "num_tokens": 35857083.0,
      "step": 85
    },
    {
      "epoch": 1.5087719298245614,
      "grad_norm": 0.14750567085422958,
      "learning_rate": 5.799657393924869e-06,
      "loss": 0.1886,
      "num_tokens": 36243816.0,
      "step": 86
    },
    {
      "epoch": 1.526315789473684,
      "grad_norm": 0.13584876516210329,
      "learning_rate": 5.714118621206843e-06,
      "loss": 0.1949,
      "num_tokens": 36682515.0,
      "step": 87
    },
    {
      "epoch": 1.543859649122807,
      "grad_norm": 0.13138592337104565,
      "learning_rate": 5.6285022285716325e-06,
      "loss": 0.1848,
      "num_tokens": 37108482.0,
      "step": 88
    },
    {
      "epoch": 1.5614035087719298,
      "grad_norm": 0.14022318239136972,
      "learning_rate": 5.542839252728096e-06,
      "loss": 0.1986,
      "num_tokens": 37546779.0,
      "step": 89
    },
    {
      "epoch": 1.5789473684210527,
      "grad_norm": 0.14886854943483496,
      "learning_rate": 5.457160747271906e-06,
      "loss": 0.2025,
      "num_tokens": 37965357.0,
      "step": 90
    },
    {
      "epoch": 1.5964912280701755,
      "grad_norm": 0.14038303393485826,
      "learning_rate": 5.371497771428368e-06,
      "loss": 0.1975,
      "num_tokens": 38406113.0,
      "step": 91
    },
    {
      "epoch": 1.6140350877192984,
      "grad_norm": 0.14458491708947513,
      "learning_rate": 5.2858813787931605e-06,
      "loss": 0.1872,
      "num_tokens": 38800413.0,
      "step": 92
    },
    {
      "epoch": 1.631578947368421,
      "grad_norm": 0.1480350806263145,
      "learning_rate": 5.2003426060751324e-06,
      "loss": 0.1968,
      "num_tokens": 39200782.0,
      "step": 93
    },
    {
      "epoch": 1.6491228070175439,
      "grad_norm": 0.13730868120439885,
      "learning_rate": 5.114912461845223e-06,
      "loss": 0.2098,
      "num_tokens": 39666940.0,
      "step": 94
    },
    {
      "epoch": 1.6666666666666665,
      "grad_norm": 0.13357583490843425,
      "learning_rate": 5.02962191529556e-06,
      "loss": 0.2005,
      "num_tokens": 40103554.0,
      "step": 95
    },
    {
      "epoch": 1.6842105263157894,
      "grad_norm": 0.13277577033293314,
      "learning_rate": 4.944501885012875e-06,
      "loss": 0.1894,
      "num_tokens": 40523764.0,
      "step": 96
    },
    {
      "epoch": 1.7017543859649122,
      "grad_norm": 0.1445531104714593,
      "learning_rate": 4.859583227770218e-06,
      "loss": 0.1966,
      "num_tokens": 40923093.0,
      "step": 97
    },
    {
      "epoch": 1.719298245614035,
      "grad_norm": 0.1346892995124459,
      "learning_rate": 4.774896727341113e-06,
      "loss": 0.2085,
      "num_tokens": 41388977.0,
      "step": 98
    },
    {
      "epoch": 1.736842105263158,
      "grad_norm": 0.13950660376280916,
      "learning_rate": 4.6904730833401575e-06,
      "loss": 0.1993,
      "num_tokens": 41804049.0,
      "step": 99
    },
    {
      "epoch": 1.7543859649122808,
      "grad_norm": 0.13419027046897863,
      "learning_rate": 4.606342900094147e-06,
      "loss": 0.197,
      "num_tokens": 42212384.0,
      "step": 100
    },
    {
      "epoch": 1.7719298245614035,
      "grad_norm": 0.13471704895191902,
      "learning_rate": 4.5225366755477165e-06,
      "loss": 0.1991,
      "num_tokens": 42626890.0,
      "step": 101
    },
    {
      "epoch": 1.7894736842105263,
      "grad_norm": 0.1370431857295371,
      "learning_rate": 4.439084790207577e-06,
      "loss": 0.181,
      "num_tokens": 43045185.0,
      "step": 102
    },
    {
      "epoch": 1.807017543859649,
      "grad_norm": 0.13585245130540036,
      "learning_rate": 4.35601749612931e-06,
      "loss": 0.1926,
      "num_tokens": 43469318.0,
      "step": 103
    },
    {
      "epoch": 1.8245614035087718,
      "grad_norm": 0.14177193142263728,
      "learning_rate": 4.273364905950711e-06,
      "loss": 0.1883,
      "num_tokens": 43860735.0,
      "step": 104
    },
    {
      "epoch": 1.8421052631578947,
      "grad_norm": 0.13934124481189936,
      "learning_rate": 4.191156981975704e-06,
      "loss": 0.186,
      "num_tokens": 44259177.0,
      "step": 105
    },
    {
      "epoch": 1.8596491228070176,
      "grad_norm": 0.13757581593440663,
      "learning_rate": 4.109423525312738e-06,
      "loss": 0.1813,
      "num_tokens": 44652826.0,
      "step": 106
    },
    {
      "epoch": 1.8771929824561404,
      "grad_norm": 0.12994085331633362,
      "learning_rate": 4.028194165071603e-06,
      "loss": 0.2007,
      "num_tokens": 45110456.0,
      "step": 107
    },
    {
      "epoch": 1.8947368421052633,
      "grad_norm": 0.1315821088883057,
      "learning_rate": 3.9474983476226335e-06,
      "loss": 0.1984,
      "num_tokens": 45561128.0,
      "step": 108
    },
    {
      "epoch": 1.912280701754386,
      "grad_norm": 0.13466926193354203,
      "learning_rate": 3.867365325922116e-06,
      "loss": 0.195,
      "num_tokens": 45999442.0,
      "step": 109
    },
    {
      "epoch": 1.9298245614035088,
      "grad_norm": 0.1407044279595099,
      "learning_rate": 3.7878241489078473e-06,
      "loss": 0.1957,
      "num_tokens": 46405471.0,
      "step": 110
    },
    {
      "epoch": 1.9473684210526314,
      "grad_norm": 0.13869286010098864,
      "learning_rate": 3.7089036509686216e-06,
      "loss": 0.2096,
      "num_tokens": 46837446.0,
      "step": 111
    },
    {
      "epoch": 1.9649122807017543,
      "grad_norm": 0.13744222256799704,
      "learning_rate": 3.630632441491512e-06,
      "loss": 0.1874,
      "num_tokens": 47238683.0,
      "step": 112
    },
    {
      "epoch": 1.9824561403508771,
      "grad_norm": 0.13584540109090137,
      "learning_rate": 3.5530388944907124e-06,
      "loss": 0.1944,
      "num_tokens": 47661310.0,
      "step": 113
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.14078927259649465,
      "learning_rate": 3.476151138321705e-06,
      "loss": 0.1893,
      "num_tokens": 48079346.0,
      "step": 114
    },
    {
      "epoch": 2.017543859649123,
      "grad_norm": 0.14801647762626233,
      "learning_rate": 3.3999970454844688e-06,
      "loss": 0.1833,
      "num_tokens": 48481444.0,
      "step": 115
    },
    {
      "epoch": 2.0350877192982457,
      "grad_norm": 0.14207211024797958,
      "learning_rate": 3.3246042225194626e-06,
      "loss": 0.1972,
      "num_tokens": 48904379.0,
      "step": 116
    },
    {
      "epoch": 2.0526315789473686,
      "grad_norm": 0.13459996651406178,
      "learning_rate": 3.2500000000000015e-06,
      "loss": 0.1734,
      "num_tokens": 49314280.0,
      "step": 117
    },
    {
      "epoch": 2.0701754385964914,
      "grad_norm": 0.13420885166445595,
      "learning_rate": 3.176211422624672e-06,
      "loss": 0.1748,
      "num_tokens": 49720807.0,
      "step": 118
    },
    {
      "epoch": 2.087719298245614,
      "grad_norm": 0.1396854400821284,
      "learning_rate": 3.103265239413401e-06,
      "loss": 0.1781,
      "num_tokens": 50151950.0,
      "step": 119
    },
    {
      "epoch": 2.1052631578947367,
      "grad_norm": 0.14321980578287785,
      "learning_rate": 3.0311878940106864e-06,
      "loss": 0.182,
      "num_tokens": 50574817.0,
      "step": 120
    },
    {
      "epoch": 2.1228070175438596,
      "grad_norm": 0.1402669344348115,
      "learning_rate": 2.9600055150995397e-06,
      "loss": 0.1804,
      "num_tokens": 50991178.0,
      "step": 121
    },
    {
      "epoch": 2.1403508771929824,
      "grad_norm": 0.14268249286817555,
      "learning_rate": 2.889743906929609e-06,
      "loss": 0.1701,
      "num_tokens": 51370437.0,
      "step": 122
    },
    {
      "epoch": 2.1578947368421053,
      "grad_norm": 0.13769468594260603,
      "learning_rate": 2.820428539962905e-06,
      "loss": 0.1803,
      "num_tokens": 51807382.0,
      "step": 123
    },
    {
      "epoch": 2.175438596491228,
      "grad_norm": 0.1418977810735795,
      "learning_rate": 2.7520845416405285e-06,
      "loss": 0.1867,
      "num_tokens": 52214420.0,
      "step": 124
    },
    {
      "epoch": 2.192982456140351,
      "grad_norm": 0.1357482680971467,
      "learning_rate": 2.6847366872737535e-06,
      "loss": 0.1855,
      "num_tokens": 52648228.0,
      "step": 125
    },
    {
      "epoch": 2.2105263157894735,
      "grad_norm": 0.13758984714319883,
      "learning_rate": 2.618409391062751e-06,
      "loss": 0.1928,
      "num_tokens": 53085345.0,
      "step": 126
    },
    {
      "epoch": 2.2280701754385963,
      "grad_norm": 0.13835249455110674,
      "learning_rate": 2.5531266972462176e-06,
      "loss": 0.1753,
      "num_tokens": 53493140.0,
      "step": 127
    },
    {
      "epoch": 2.245614035087719,
      "grad_norm": 0.13785872746102823,
      "learning_rate": 2.4889122713851397e-06,
      "loss": 0.1734,
      "num_tokens": 53922832.0,
      "step": 128
    },
    {
      "epoch": 2.263157894736842,
      "grad_norm": 0.1369141853241696,
      "learning_rate": 2.425789391783796e-06,
      "loss": 0.1771,
      "num_tokens": 54319035.0,
      "step": 129
    },
    {
      "epoch": 2.280701754385965,
      "grad_norm": 0.13274261857238875,
      "learning_rate": 2.36378094105118e-06,
      "loss": 0.1759,
      "num_tokens": 54754161.0,
      "step": 130
    },
    {
      "epoch": 2.2982456140350878,
      "grad_norm": 0.13374187150373806,
      "learning_rate": 2.302909397805841e-06,
      "loss": 0.1757,
      "num_tokens": 55177972.0,
      "step": 131
    },
    {
      "epoch": 2.3157894736842106,
      "grad_norm": 0.14056010545232225,
      "learning_rate": 2.2431968285271843e-06,
      "loss": 0.1762,
      "num_tokens": 55567532.0,
      "step": 132
    },
    {
      "epoch": 2.3333333333333335,
      "grad_norm": 0.14440093167538778,
      "learning_rate": 2.1846648795561777e-06,
      "loss": 0.1789,
      "num_tokens": 55997257.0,
      "step": 133
    },
    {
      "epoch": 2.3508771929824563,
      "grad_norm": 0.13209921359488944,
      "learning_rate": 2.1273347692483574e-06,
      "loss": 0.1683,
      "num_tokens": 56435780.0,
      "step": 134
    },
    {
      "epoch": 2.3684210526315788,
      "grad_norm": 0.13075669290172312,
      "learning_rate": 2.071227280281982e-06,
      "loss": 0.1766,
      "num_tokens": 56880449.0,
      "step": 135
    },
    {
      "epoch": 2.3859649122807016,
      "grad_norm": 0.15520159951824955,
      "learning_rate": 2.016362752124129e-06,
      "loss": 0.1766,
      "num_tokens": 57298925.0,
      "step": 136
    },
    {
      "epoch": 2.4035087719298245,
      "grad_norm": 0.13674994090087955,
      "learning_rate": 1.9627610736574575e-06,
      "loss": 0.1689,
      "num_tokens": 57702294.0,
      "step": 137
    },
    {
      "epoch": 2.4210526315789473,
      "grad_norm": 0.13321588166669115,
      "learning_rate": 1.9104416759703017e-06,
      "loss": 0.1758,
      "num_tokens": 58154418.0,
      "step": 138
    },
    {
      "epoch": 2.43859649122807,
      "grad_norm": 0.13632315461262803,
      "learning_rate": 1.8594235253127373e-06,
      "loss": 0.1767,
      "num_tokens": 58579683.0,
      "step": 139
    },
    {
      "epoch": 2.456140350877193,
      "grad_norm": 0.1288153252214618,
      "learning_rate": 1.8097251162211405e-06,
      "loss": 0.1811,
      "num_tokens": 59051562.0,
      "step": 140
    },
    {
      "epoch": 2.473684210526316,
      "grad_norm": 0.12987465966750822,
      "learning_rate": 1.7613644648137543e-06,
      "loss": 0.1774,
      "num_tokens": 59501698.0,
      "step": 141
    },
    {
      "epoch": 2.4912280701754383,
      "grad_norm": 0.1374425426257984,
      "learning_rate": 1.7143591022596846e-06,
      "loss": 0.1944,
      "num_tokens": 59955592.0,
      "step": 142
    },
    {
      "epoch": 2.5087719298245617,
      "grad_norm": 0.13531966438989285,
      "learning_rate": 1.6687260684236943e-06,
      "loss": 0.1805,
      "num_tokens": 60381274.0,
      "step": 143
    },
    {
      "epoch": 2.526315789473684,
      "grad_norm": 0.134925096600037,
      "learning_rate": 1.6244819056890975e-06,
      "loss": 0.171,
      "num_tokens": 60788438.0,
      "step": 144
    },
    {
      "epoch": 2.543859649122807,
      "grad_norm": 0.1407719819848176,
      "learning_rate": 1.5816426529610035e-06,
      "loss": 0.1714,
      "num_tokens": 61184551.0,
      "step": 145
    },
    {
      "epoch": 2.56140350877193,
      "grad_norm": 0.1359063010745857,
      "learning_rate": 1.5402238398520614e-06,
      "loss": 0.1758,
      "num_tokens": 61619839.0,
      "step": 146
    },
    {
      "epoch": 2.5789473684210527,
      "grad_norm": 0.13246835060504009,
      "learning_rate": 1.5002404810528452e-06,
      "loss": 0.1739,
      "num_tokens": 62045889.0,
      "step": 147
    },
    {
      "epoch": 2.5964912280701755,
      "grad_norm": 0.13964950522945552,
      "learning_rate": 1.4617070708888882e-06,
      "loss": 0.1718,
      "num_tokens": 62437900.0,
      "step": 148
    },
    {
      "epoch": 2.6140350877192984,
      "grad_norm": 0.13372294299780066,
      "learning_rate": 1.4246375780663613e-06,
      "loss": 0.1661,
      "num_tokens": 62843119.0,
      "step": 149
    },
    {
      "epoch": 2.6315789473684212,
      "grad_norm": 0.13805400712254826,
      "learning_rate": 1.389045440608296e-06,
      "loss": 0.1828,
      "num_tokens": 63262284.0,
      "step": 150
    },
    {
      "epoch": 2.6491228070175437,
      "grad_norm": 0.13808339180910903,
      "learning_rate": 1.354943560983175e-06,
      "loss": 0.1803,
      "num_tokens": 63701914.0,
      "step": 151
    },
    {
      "epoch": 2.6666666666666665,
      "grad_norm": 0.13165183261262675,
      "learning_rate": 1.3223443014276738e-06,
      "loss": 0.1774,
      "num_tokens": 64149759.0,
      "step": 152
    },
    {
      "epoch": 2.6842105263157894,
      "grad_norm": 0.1319243140393186,
      "learning_rate": 1.2912594794652406e-06,
      "loss": 0.1799,
      "num_tokens": 64602514.0,
      "step": 153
    },
    {
      "epoch": 2.7017543859649122,
      "grad_norm": 0.13538494698388223,
      "learning_rate": 1.2617003636221394e-06,
      "loss": 0.1694,
      "num_tokens": 64992832.0,
      "step": 154
    },
    {
      "epoch": 2.719298245614035,
      "grad_norm": 0.13498616412912304,
      "learning_rate": 1.2336776693425028e-06,
      "loss": 0.1707,
      "num_tokens": 65389001.0,
      "step": 155
    },
    {
      "epoch": 2.736842105263158,
      "grad_norm": 0.12924445948277097,
      "learning_rate": 1.2072015551038933e-06,
      "loss": 0.1692,
      "num_tokens": 65842297.0,
      "step": 156
    },
    {
      "epoch": 2.754385964912281,
      "grad_norm": 0.13206490698518994,
      "learning_rate": 1.1822816187347625e-06,
      "loss": 0.1719,
      "num_tokens": 66254756.0,
      "step": 157
    },
    {
      "epoch": 2.7719298245614032,
      "grad_norm": 0.13946311479316087,
      "learning_rate": 1.1589268939351499e-06,
      "loss": 0.1824,
      "num_tokens": 66653349.0,
      "step": 158
    },
    {
      "epoch": 2.7894736842105265,
      "grad_norm": 0.12958606648427337,
      "learning_rate": 1.1371458470018896e-06,
      "loss": 0.1758,
      "num_tokens": 67089072.0,
      "step": 159
    },
    {
      "epoch": 2.807017543859649,
      "grad_norm": 0.12836201287391627,
      "learning_rate": 1.1169463737594995e-06,
      "loss": 0.1725,
      "num_tokens": 67530318.0,
      "step": 160
    },
    {
      "epoch": 2.824561403508772,
      "grad_norm": 0.1312877766550337,
      "learning_rate": 1.0983357966978747e-06,
      "loss": 0.17,
      "num_tokens": 67943670.0,
      "step": 161
    },
    {
      "epoch": 2.8421052631578947,
      "grad_norm": 0.1292736067143446,
      "learning_rate": 1.0813208623178199e-06,
      "loss": 0.1759,
      "num_tokens": 68380170.0,
      "step": 162
    },
    {
      "epoch": 2.8596491228070176,
      "grad_norm": 0.13227684289793154,
      "learning_rate": 1.0659077386853817e-06,
      "loss": 0.1719,
      "num_tokens": 68808114.0,
      "step": 163
    },
    {
      "epoch": 2.8771929824561404,
      "grad_norm": 0.13694616326909123,
      "learning_rate": 1.0521020131958692e-06,
      "loss": 0.168,
      "num_tokens": 69191512.0,
      "step": 164
    },
    {
      "epoch": 2.8947368421052633,
      "grad_norm": 0.135480809490847,
      "learning_rate": 1.0399086905483752e-06,
      "loss": 0.1659,
      "num_tokens": 69582710.0,
      "step": 165
    },
    {
      "epoch": 2.912280701754386,
      "grad_norm": 0.13656650923592858,
      "learning_rate": 1.0293321909315242e-06,
      "loss": 0.1764,
      "num_tokens": 69995390.0,
      "step": 166
    },
    {
      "epoch": 2.9298245614035086,
      "grad_norm": 0.13037307085567856,
      "learning_rate": 1.0203763484211196e-06,
      "loss": 0.1737,
      "num_tokens": 70418613.0,
      "step": 167
    },
    {
      "epoch": 2.9473684210526314,
      "grad_norm": 0.13386161162103719,
      "learning_rate": 1.0130444095902514e-06,
      "loss": 0.1731,
      "num_tokens": 70848952.0,
      "step": 168
    },
    {
      "epoch": 2.9649122807017543,
      "grad_norm": 0.13477603098195323,
      "learning_rate": 1.0073390323323897e-06,
      "loss": 0.1859,
      "num_tokens": 71284648.0,
      "step": 169
    },
    {
      "epoch": 2.982456140350877,
      "grad_norm": 0.13335006132129937,
      "learning_rate": 1.0032622848978689e-06,
      "loss": 0.1727,
      "num_tokens": 71712021.0,
      "step": 170
    },
    {
      "epoch": 3.0,
      "grad_norm": 0.13952787638079295,
      "learning_rate": 1.000815645144134e-06,
      "loss": 0.1803,
      "num_tokens": 72119019.0,
      "step": 171
    },
    {
      "epoch": 3.0,
      "step": 171,
      "total_flos": 2.3168881532705178e+17,
      "train_loss": 0.25350178002614027,
      "train_runtime": 2712.8956,
      "train_samples_per_second": 8.068,
      "train_steps_per_second": 0.063
    }
  ],
  "logging_steps": 1,
  "max_steps": 171,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.3168881532705178e+17,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}