{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 10000000,
  "global_step": 1488,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.020161290322580645,
      "grad_norm": 104.71724700927734,
      "learning_rate": 1.2e-07,
      "loss": 1.4171,
      "step": 10
    },
    {
      "epoch": 0.04032258064516129,
      "grad_norm": 19.342660903930664,
      "learning_rate": 2.533333333333333e-07,
      "loss": 1.4035,
      "step": 20
    },
    {
      "epoch": 0.06048387096774194,
      "grad_norm": 16.795289993286133,
      "learning_rate": 3.8666666666666664e-07,
      "loss": 1.25,
      "step": 30
    },
    {
      "epoch": 0.08064516129032258,
      "grad_norm": 12.612614631652832,
      "learning_rate": 5.2e-07,
      "loss": 1.0432,
      "step": 40
    },
    {
      "epoch": 0.10080645161290322,
      "grad_norm": 14.135688781738281,
      "learning_rate": 6.533333333333333e-07,
      "loss": 0.9721,
      "step": 50
    },
    {
      "epoch": 0.12096774193548387,
      "grad_norm": 15.069091796875,
      "learning_rate": 7.866666666666666e-07,
      "loss": 0.8305,
      "step": 60
    },
    {
      "epoch": 0.14112903225806453,
      "grad_norm": 12.399056434631348,
      "learning_rate": 9.2e-07,
      "loss": 0.7259,
      "step": 70
    },
    {
      "epoch": 0.16129032258064516,
      "grad_norm": 8.677566528320312,
      "learning_rate": 9.999802270007193e-07,
      "loss": 0.673,
      "step": 80
    },
    {
      "epoch": 0.1814516129032258,
      "grad_norm": 10.323257446289062,
      "learning_rate": 9.997577987186727e-07,
      "loss": 0.6432,
      "step": 90
    },
    {
      "epoch": 0.20161290322580644,
      "grad_norm": 10.148702621459961,
      "learning_rate": 9.992883362200682e-07,
      "loss": 0.6266,
      "step": 100
    },
    {
      "epoch": 0.2217741935483871,
      "grad_norm": 11.491229057312012,
      "learning_rate": 9.985720715639167e-07,
      "loss": 0.5784,
      "step": 110
    },
    {
      "epoch": 0.24193548387096775,
      "grad_norm": 10.31007194519043,
      "learning_rate": 9.976093588054797e-07,
      "loss": 0.5034,
      "step": 120
    },
    {
      "epoch": 0.2620967741935484,
      "grad_norm": 8.234132766723633,
      "learning_rate": 9.964006738212574e-07,
      "loss": 0.5401,
      "step": 130
    },
    {
      "epoch": 0.28225806451612906,
      "grad_norm": 11.453391075134277,
      "learning_rate": 9.949466140737583e-07,
      "loss": 0.5411,
      "step": 140
    },
    {
      "epoch": 0.3024193548387097,
      "grad_norm": 11.507453918457031,
      "learning_rate": 9.932478983161692e-07,
      "loss": 0.5281,
      "step": 150
    },
    {
      "epoch": 0.3225806451612903,
      "grad_norm": 12.632501602172852,
      "learning_rate": 9.913053662370705e-07,
      "loss": 0.4564,
      "step": 160
    },
    {
      "epoch": 0.34274193548387094,
      "grad_norm": 9.758938789367676,
      "learning_rate": 9.891199780453699e-07,
      "loss": 0.4815,
      "step": 170
    },
    {
      "epoch": 0.3629032258064516,
      "grad_norm": 11.363844871520996,
      "learning_rate": 9.866928139956655e-07,
      "loss": 0.5007,
      "step": 180
    },
    {
      "epoch": 0.38306451612903225,
      "grad_norm": 10.484410285949707,
      "learning_rate": 9.840250738542662e-07,
      "loss": 0.4611,
      "step": 190
    },
    {
      "epoch": 0.4032258064516129,
      "grad_norm": 12.169049263000488,
      "learning_rate": 9.811180763061378e-07,
      "loss": 0.4685,
      "step": 200
    },
    {
      "epoch": 0.42338709677419356,
      "grad_norm": 9.939359664916992,
      "learning_rate": 9.77973258303067e-07,
      "loss": 0.4794,
      "step": 210
    },
    {
      "epoch": 0.4435483870967742,
      "grad_norm": 9.921211242675781,
      "learning_rate": 9.745921743533651e-07,
      "loss": 0.3909,
      "step": 220
    },
    {
      "epoch": 0.4637096774193548,
      "grad_norm": 11.779664993286133,
      "learning_rate": 9.709764957534615e-07,
      "loss": 0.4505,
      "step": 230
    },
    {
      "epoch": 0.4838709677419355,
      "grad_norm": 9.654085159301758,
      "learning_rate": 9.671280097617692e-07,
      "loss": 0.4118,
      "step": 240
    },
    {
      "epoch": 0.5040322580645161,
      "grad_norm": 12.259678840637207,
      "learning_rate": 9.63048618715229e-07,
      "loss": 0.4373,
      "step": 250
    },
    {
      "epoch": 0.5241935483870968,
      "grad_norm": 12.087785720825195,
      "learning_rate": 9.58740339088969e-07,
      "loss": 0.3508,
      "step": 260
    },
    {
      "epoch": 0.5443548387096774,
      "grad_norm": 9.039093017578125,
      "learning_rate": 9.542053004995452e-07,
      "loss": 0.3848,
      "step": 270
    },
    {
      "epoch": 0.5645161290322581,
      "grad_norm": 10.91696834564209,
      "learning_rate": 9.494457446522555e-07,
      "loss": 0.3858,
      "step": 280
    },
    {
      "epoch": 0.5846774193548387,
      "grad_norm": 11.28246021270752,
      "learning_rate": 9.444640242330468e-07,
      "loss": 0.4052,
      "step": 290
    },
    {
      "epoch": 0.6048387096774194,
      "grad_norm": 9.38158893585205,
      "learning_rate": 9.392626017455638e-07,
      "loss": 0.3984,
      "step": 300
    },
    {
      "epoch": 0.625,
      "grad_norm": 13.301438331604004,
      "learning_rate": 9.338440482939145e-07,
      "loss": 0.3586,
      "step": 310
    },
    {
      "epoch": 0.6451612903225806,
      "grad_norm": 12.870404243469238,
      "learning_rate": 9.282110423117524e-07,
      "loss": 0.3641,
      "step": 320
    },
    {
      "epoch": 0.6653225806451613,
      "grad_norm": 11.24843978881836,
      "learning_rate": 9.223663682383066e-07,
      "loss": 0.3684,
      "step": 330
    },
    {
      "epoch": 0.6854838709677419,
      "grad_norm": 12.58388614654541,
      "learning_rate": 9.163129151420105e-07,
      "loss": 0.3954,
      "step": 340
    },
    {
      "epoch": 0.7056451612903226,
      "grad_norm": 10.611601829528809,
      "learning_rate": 9.100536752924135e-07,
      "loss": 0.3774,
      "step": 350
    },
    {
      "epoch": 0.7258064516129032,
      "grad_norm": 12.370527267456055,
      "learning_rate": 9.035917426810781e-07,
      "loss": 0.3682,
      "step": 360
    },
    {
      "epoch": 0.7459677419354839,
      "grad_norm": 12.168606758117676,
      "learning_rate": 8.969303114921956e-07,
      "loss": 0.3929,
      "step": 370
    },
    {
      "epoch": 0.7661290322580645,
      "grad_norm": 10.513528823852539,
      "learning_rate": 8.900726745236751e-07,
      "loss": 0.3525,
      "step": 380
    },
    {
      "epoch": 0.7862903225806451,
      "grad_norm": 10.08989429473877,
      "learning_rate": 8.83022221559489e-07,
      "loss": 0.3961,
      "step": 390
    },
    {
      "epoch": 0.8064516129032258,
      "grad_norm": 10.106974601745605,
      "learning_rate": 8.757824376940745e-07,
      "loss": 0.3609,
      "step": 400
    },
    {
      "epoch": 0.8266129032258065,
      "grad_norm": 12.857497215270996,
      "learning_rate": 8.68356901609625e-07,
      "loss": 0.404,
      "step": 410
    },
    {
      "epoch": 0.8467741935483871,
      "grad_norm": 12.46678352355957,
      "learning_rate": 8.60749283807119e-07,
      "loss": 0.3438,
      "step": 420
    },
    {
      "epoch": 0.8669354838709677,
      "grad_norm": 14.995087623596191,
      "learning_rate": 8.529633447919622e-07,
      "loss": 0.3253,
      "step": 430
    },
    {
      "epoch": 0.8870967741935484,
      "grad_norm": 10.057149887084961,
      "learning_rate": 8.450029332151406e-07,
      "loss": 0.3448,
      "step": 440
    },
    {
      "epoch": 0.907258064516129,
      "grad_norm": 12.985930442810059,
      "learning_rate": 8.368719839708018e-07,
      "loss": 0.4045,
      "step": 450
    },
    {
      "epoch": 0.9274193548387096,
      "grad_norm": 10.555837631225586,
      "learning_rate": 8.285745162512056e-07,
      "loss": 0.3673,
      "step": 460
    },
    {
      "epoch": 0.9475806451612904,
      "grad_norm": 9.865274429321289,
      "learning_rate": 8.20114631560006e-07,
      "loss": 0.3876,
      "step": 470
    },
    {
      "epoch": 0.967741935483871,
      "grad_norm": 11.42111587524414,
      "learning_rate": 8.114965116848454e-07,
      "loss": 0.3165,
      "step": 480
    },
    {
      "epoch": 0.9879032258064516,
      "grad_norm": 8.909076690673828,
      "learning_rate": 8.02724416630264e-07,
      "loss": 0.342,
      "step": 490
    },
    {
      "epoch": 1.0080645161290323,
      "grad_norm": 10.668339729309082,
      "learning_rate": 7.938026825119463e-07,
      "loss": 0.3457,
      "step": 500
    },
    {
      "epoch": 1.028225806451613,
      "grad_norm": 12.418227195739746,
      "learning_rate": 7.847357194133442e-07,
      "loss": 0.2843,
      "step": 510
    },
    {
      "epoch": 1.0483870967741935,
      "grad_norm": 10.189166069030762,
      "learning_rate": 7.755280092057391e-07,
      "loss": 0.2967,
      "step": 520
    },
    {
      "epoch": 1.0685483870967742,
      "grad_norm": 9.147650718688965,
      "learning_rate": 7.661841033328169e-07,
      "loss": 0.2698,
      "step": 530
    },
    {
      "epoch": 1.0887096774193548,
      "grad_norm": 10.935348510742188,
      "learning_rate": 7.567086205608533e-07,
      "loss": 0.3049,
      "step": 540
    },
    {
      "epoch": 1.1088709677419355,
      "grad_norm": 9.401994705200195,
      "learning_rate": 7.471062446956225e-07,
      "loss": 0.2689,
      "step": 550
    },
    {
      "epoch": 1.129032258064516,
      "grad_norm": 13.119884490966797,
      "learning_rate": 7.373817222671535e-07,
      "loss": 0.2712,
      "step": 560
    },
    {
      "epoch": 1.1491935483870968,
      "grad_norm": 9.512726783752441,
      "learning_rate": 7.275398601834835e-07,
      "loss": 0.2703,
      "step": 570
    },
    {
      "epoch": 1.1693548387096775,
      "grad_norm": 12.284181594848633,
      "learning_rate": 7.175855233545667e-07,
      "loss": 0.2399,
      "step": 580
    },
    {
      "epoch": 1.189516129032258,
      "grad_norm": 9.64366626739502,
      "learning_rate": 7.075236322875087e-07,
      "loss": 0.2685,
      "step": 590
    },
    {
      "epoch": 1.2096774193548387,
      "grad_norm": 11.060663223266602,
      "learning_rate": 6.973591606543226e-07,
      "loss": 0.2758,
      "step": 600
    },
    {
      "epoch": 1.2298387096774193,
      "grad_norm": 11.46677303314209,
      "learning_rate": 6.870971328334037e-07,
      "loss": 0.2942,
      "step": 610
    },
    {
      "epoch": 1.25,
      "grad_norm": 11.921086311340332,
      "learning_rate": 6.767426214259388e-07,
      "loss": 0.2779,
      "step": 620
    },
    {
      "epoch": 1.2701612903225805,
      "grad_norm": 12.505317687988281,
      "learning_rate": 6.663007447484806e-07,
      "loss": 0.2561,
      "step": 630
    },
    {
      "epoch": 1.2903225806451613,
      "grad_norm": 8.572080612182617,
      "learning_rate": 6.557766643029226e-07,
      "loss": 0.2456,
      "step": 640
    },
    {
      "epoch": 1.310483870967742,
      "grad_norm": 9.482403755187988,
      "learning_rate": 6.451755822251284e-07,
      "loss": 0.2666,
      "step": 650
    },
    {
      "epoch": 1.3306451612903225,
      "grad_norm": 11.761874198913574,
      "learning_rate": 6.345027387134749e-07,
      "loss": 0.2781,
      "step": 660
    },
    {
      "epoch": 1.3508064516129032,
      "grad_norm": 9.536797523498535,
      "learning_rate": 6.237634094385813e-07,
      "loss": 0.2528,
      "step": 670
    },
    {
      "epoch": 1.370967741935484,
      "grad_norm": 10.880298614501953,
      "learning_rate": 6.129629029355033e-07,
      "loss": 0.3138,
      "step": 680
    },
    {
      "epoch": 1.3911290322580645,
      "grad_norm": 11.278773307800293,
      "learning_rate": 6.02106557979682e-07,
      "loss": 0.301,
      "step": 690
    },
    {
      "epoch": 1.4112903225806452,
      "grad_norm": 13.422077178955078,
      "learning_rate": 5.91199740947946e-07,
      "loss": 0.2827,
      "step": 700
    },
    {
      "epoch": 1.4314516129032258,
      "grad_norm": 11.68918228149414,
      "learning_rate": 5.802478431658682e-07,
      "loss": 0.2921,
      "step": 710
    },
    {
      "epoch": 1.4516129032258065,
      "grad_norm": 9.534116744995117,
      "learning_rate": 5.692562782427916e-07,
      "loss": 0.2727,
      "step": 720
    },
    {
      "epoch": 1.471774193548387,
      "grad_norm": 13.032623291015625,
      "learning_rate": 5.582304793958399e-07,
      "loss": 0.2581,
      "step": 730
    },
    {
      "epoch": 1.4919354838709677,
      "grad_norm": 10.85723876953125,
      "learning_rate": 5.471758967642341e-07,
      "loss": 0.2832,
      "step": 740
    },
    {
      "epoch": 1.5120967741935485,
      "grad_norm": 12.404828071594238,
      "learning_rate": 5.36097994715248e-07,
      "loss": 0.2715,
      "step": 750
    },
    {
      "epoch": 1.532258064516129,
      "grad_norm": 10.57315731048584,
      "learning_rate": 5.250022491431259e-07,
      "loss": 0.2422,
      "step": 760
    },
    {
      "epoch": 1.5524193548387095,
      "grad_norm": 11.932204246520996,
      "learning_rate": 5.138941447623065e-07,
      "loss": 0.2868,
      "step": 770
    },
    {
      "epoch": 1.5725806451612905,
      "grad_norm": 10.36052131652832,
      "learning_rate": 5.027791723962854e-07,
      "loss": 0.2738,
      "step": 780
    },
    {
      "epoch": 1.592741935483871,
      "grad_norm": 14.13831615447998,
      "learning_rate": 4.916628262634568e-07,
      "loss": 0.2561,
      "step": 790
    },
    {
      "epoch": 1.6129032258064515,
      "grad_norm": 10.870752334594727,
      "learning_rate": 4.805506012612792e-07,
      "loss": 0.2675,
      "step": 800
    },
    {
      "epoch": 1.6330645161290323,
      "grad_norm": 12.759139060974121,
      "learning_rate": 4.694479902501033e-07,
      "loss": 0.2526,
      "step": 810
    },
    {
      "epoch": 1.653225806451613,
      "grad_norm": 10.99550724029541,
      "learning_rate": 4.5836048133800864e-07,
      "loss": 0.2441,
      "step": 820
    },
    {
      "epoch": 1.6733870967741935,
      "grad_norm": 10.80727767944336,
      "learning_rate": 4.4729355516798814e-07,
      "loss": 0.2661,
      "step": 830
    },
    {
      "epoch": 1.6935483870967742,
      "grad_norm": 10.887651443481445,
      "learning_rate": 4.362526822088228e-07,
      "loss": 0.2845,
      "step": 840
    },
    {
      "epoch": 1.713709677419355,
      "grad_norm": 12.84078311920166,
      "learning_rate": 4.252433200509868e-07,
      "loss": 0.2754,
      "step": 850
    },
    {
      "epoch": 1.7338709677419355,
      "grad_norm": 14.602743148803711,
      "learning_rate": 4.142709107089171e-07,
      "loss": 0.2506,
      "step": 860
    },
    {
      "epoch": 1.754032258064516,
      "grad_norm": 12.086965560913086,
      "learning_rate": 4.033408779309819e-07,
      "loss": 0.255,
      "step": 870
    },
    {
      "epoch": 1.7741935483870968,
      "grad_norm": 11.939308166503906,
      "learning_rate": 3.9245862451848093e-07,
      "loss": 0.2669,
      "step": 880
    },
    {
      "epoch": 1.7943548387096775,
      "grad_norm": 13.408591270446777,
      "learning_rate": 3.816295296549967e-07,
      "loss": 0.2882,
      "step": 890
    },
    {
      "epoch": 1.814516129032258,
      "grad_norm": 13.375335693359375,
      "learning_rate": 3.708589462474221e-07,
      "loss": 0.2816,
      "step": 900
    },
    {
      "epoch": 1.8346774193548387,
      "grad_norm": 11.074604034423828,
      "learning_rate": 3.6015219827997677e-07,
      "loss": 0.2672,
      "step": 910
    },
    {
      "epoch": 1.8548387096774195,
      "grad_norm": 11.15030288696289,
      "learning_rate": 3.4951457818251934e-07,
      "loss": 0.2586,
      "step": 920
    },
    {
      "epoch": 1.875,
      "grad_norm": 13.091116905212402,
      "learning_rate": 3.3895134421445805e-07,
      "loss": 0.2383,
      "step": 930
    },
    {
      "epoch": 1.8951612903225805,
      "grad_norm": 10.90065860748291,
      "learning_rate": 3.2846771786555073e-07,
      "loss": 0.2892,
      "step": 940
    },
    {
      "epoch": 1.9153225806451613,
      "grad_norm": 11.27858829498291,
      "learning_rate": 3.180688812748825e-07,
      "loss": 0.2704,
      "step": 950
    },
    {
      "epoch": 1.935483870967742,
      "grad_norm": 11.70596694946289,
      "learning_rate": 3.0775997466929315e-07,
      "loss": 0.2994,
      "step": 960
    },
    {
      "epoch": 1.9556451612903225,
      "grad_norm": 14.318155288696289,
      "learning_rate": 2.9754609382252244e-07,
      "loss": 0.2765,
      "step": 970
    },
    {
      "epoch": 1.9758064516129032,
      "grad_norm": 9.653993606567383,
      "learning_rate": 2.874322875363283e-07,
      "loss": 0.2346,
      "step": 980
    },
    {
      "epoch": 1.995967741935484,
      "grad_norm": 10.425935745239258,
      "learning_rate": 2.774235551448265e-07,
      "loss": 0.2732,
      "step": 990
    },
    {
      "epoch": 2.0161290322580645,
      "grad_norm": 12.588045120239258,
      "learning_rate": 2.6752484404327735e-07,
      "loss": 0.1989,
      "step": 1000
    },
    {
      "epoch": 2.036290322580645,
      "grad_norm": 13.573112487792969,
      "learning_rate": 2.5774104724255187e-07,
      "loss": 0.1974,
      "step": 1010
    },
    {
      "epoch": 2.056451612903226,
      "grad_norm": 10.552323341369629,
      "learning_rate": 2.480770009504773e-07,
      "loss": 0.2043,
      "step": 1020
    },
    {
      "epoch": 2.0766129032258065,
      "grad_norm": 13.050751686096191,
      "learning_rate": 2.3853748218125996e-07,
      "loss": 0.2061,
      "step": 1030
    },
    {
      "epoch": 2.096774193548387,
      "grad_norm": 11.474674224853516,
      "learning_rate": 2.2912720639417154e-07,
      "loss": 0.185,
      "step": 1040
    },
    {
      "epoch": 2.1169354838709675,
      "grad_norm": 11.878608703613281,
      "learning_rate": 2.1985082516265995e-07,
      "loss": 0.2002,
      "step": 1050
    },
    {
      "epoch": 2.1370967741935485,
      "grad_norm": 14.378472328186035,
      "learning_rate": 2.1071292387503858e-07,
      "loss": 0.1829,
      "step": 1060
    },
    {
      "epoch": 2.157258064516129,
      "grad_norm": 10.852553367614746,
      "learning_rate": 2.0171801946789414e-07,
      "loss": 0.1768,
      "step": 1070
    },
    {
      "epoch": 2.1774193548387095,
      "grad_norm": 11.97855281829834,
      "learning_rate": 1.9287055819332965e-07,
      "loss": 0.1846,
      "step": 1080
    },
    {
      "epoch": 2.1975806451612905,
      "grad_norm": 11.896756172180176,
      "learning_rate": 1.84174913421145e-07,
      "loss": 0.2036,
      "step": 1090
    },
    {
      "epoch": 2.217741935483871,
      "grad_norm": 12.411077499389648,
      "learning_rate": 1.7563538347704783e-07,
      "loss": 0.2194,
      "step": 1100
    },
    {
      "epoch": 2.2379032258064515,
      "grad_norm": 10.730416297912598,
      "learning_rate": 1.6725618951795673e-07,
      "loss": 0.1853,
      "step": 1110
    },
    {
      "epoch": 2.258064516129032,
      "grad_norm": 11.932442665100098,
      "learning_rate": 1.5904147344544928e-07,
      "loss": 0.2063,
      "step": 1120
    },
    {
      "epoch": 2.278225806451613,
      "grad_norm": 11.115213394165039,
      "learning_rate": 1.5099529585838827e-07,
      "loss": 0.2214,
      "step": 1130
    },
    {
      "epoch": 2.2983870967741935,
      "grad_norm": 11.2450532913208,
      "learning_rate": 1.4312163404573623e-07,
      "loss": 0.2046,
      "step": 1140
    },
    {
      "epoch": 2.318548387096774,
      "grad_norm": 11.3887357711792,
      "learning_rate": 1.354243800205483e-07,
      "loss": 0.233,
      "step": 1150
    },
    {
      "epoch": 2.338709677419355,
      "grad_norm": 11.942183494567871,
      "learning_rate": 1.279073385961217e-07,
      "loss": 0.1868,
      "step": 1160
    },
    {
      "epoch": 2.3588709677419355,
      "grad_norm": 12.46717357635498,
      "learning_rate": 1.2057422550524504e-07,
      "loss": 0.2017,
      "step": 1170
    },
    {
      "epoch": 2.379032258064516,
      "grad_norm": 16.602731704711914,
      "learning_rate": 1.1342866556348302e-07,
      "loss": 0.2178,
      "step": 1180
    },
    {
      "epoch": 2.399193548387097,
      "grad_norm": 12.216778755187988,
      "learning_rate": 1.0647419087740117e-07,
      "loss": 0.1798,
      "step": 1190
    },
    {
      "epoch": 2.4193548387096775,
      "grad_norm": 9.579974174499512,
      "learning_rate": 9.971423909861803e-08,
      "loss": 0.2114,
      "step": 1200
    },
    {
      "epoch": 2.439516129032258,
      "grad_norm": 12.557072639465332,
      "learning_rate": 9.315215172454688e-08,
      "loss": 0.1898,
      "step": 1210
    },
    {
      "epoch": 2.4596774193548385,
      "grad_norm": 10.30516242980957,
      "learning_rate": 8.679117244666706e-08,
      "loss": 0.1924,
      "step": 1220
    },
    {
      "epoch": 2.4798387096774195,
      "grad_norm": 12.587124824523926,
      "learning_rate": 8.063444554714172e-08,
      "loss": 0.2189,
      "step": 1230
    },
    {
      "epoch": 2.5,
      "grad_norm": 11.642440795898438,
      "learning_rate": 7.468501434457469e-08,
      "loss": 0.2279,
      "step": 1240
    },
    {
      "epoch": 2.5201612903225805,
      "grad_norm": 9.209700584411621,
      "learning_rate": 6.894581968967367e-08,
      "loss": 0.1969,
      "step": 1250
    },
    {
      "epoch": 2.540322580645161,
      "grad_norm": 9.485260009765625,
      "learning_rate": 6.341969851156492e-08,
      "loss": 0.1784,
      "step": 1260
    },
    {
      "epoch": 2.560483870967742,
      "grad_norm": 8.158933639526367,
      "learning_rate": 5.810938241547669e-08,
      "loss": 0.1612,
      "step": 1270
    },
    {
      "epoch": 2.5806451612903225,
      "grad_norm": 10.125198364257812,
      "learning_rate": 5.301749633248531e-08,
      "loss": 0.2255,
      "step": 1280
    },
    {
      "epoch": 2.600806451612903,
      "grad_norm": 10.74553394317627,
      "learning_rate": 4.814655722199096e-08,
      "loss": 0.191,
      "step": 1290
    },
    {
      "epoch": 2.620967741935484,
      "grad_norm": 12.781230926513672,
      "learning_rate": 4.349897282756487e-08,
      "loss": 0.2198,
      "step": 1300
    },
    {
      "epoch": 2.6411290322580645,
      "grad_norm": 14.817058563232422,
      "learning_rate": 3.90770404867829e-08,
      "loss": 0.1992,
      "step": 1310
    },
    {
      "epoch": 2.661290322580645,
      "grad_norm": 10.511248588562012,
      "learning_rate": 3.4882945995633073e-08,
      "loss": 0.2142,
      "step": 1320
    },
    {
      "epoch": 2.681451612903226,
      "grad_norm": 12.40715503692627,
      "learning_rate": 3.0918762528059805e-08,
      "loss": 0.1963,
      "step": 1330
    },
    {
      "epoch": 2.7016129032258065,
      "grad_norm": 14.832979202270508,
      "learning_rate": 2.718644961117744e-08,
      "loss": 0.2053,
      "step": 1340
    },
    {
      "epoch": 2.721774193548387,
      "grad_norm": 12.077789306640625,
      "learning_rate": 2.368785215666064e-08,
      "loss": 0.1808,
      "step": 1350
    },
    {
      "epoch": 2.741935483870968,
      "grad_norm": 12.798100471496582,
      "learning_rate": 2.042469954879006e-08,
      "loss": 0.2104,
      "step": 1360
    },
    {
      "epoch": 2.7620967741935485,
      "grad_norm": 11.502704620361328,
      "learning_rate": 1.7398604789604033e-08,
      "loss": 0.2131,
      "step": 1370
    },
    {
      "epoch": 2.782258064516129,
      "grad_norm": 13.160285949707031,
      "learning_rate": 1.4611063701578886e-08,
      "loss": 0.2023,
      "step": 1380
    },
    {
      "epoch": 2.8024193548387095,
      "grad_norm": 14.090004920959473,
      "learning_rate": 1.2063454188232348e-08,
      "loss": 0.2045,
      "step": 1390
    },
    {
      "epoch": 2.8225806451612905,
      "grad_norm": 11.760912895202637,
      "learning_rate": 9.757035553014493e-09,
      "loss": 0.2063,
      "step": 1400
    },
    {
      "epoch": 2.842741935483871,
      "grad_norm": 13.928801536560059,
      "learning_rate": 7.692947876824728e-09,
      "loss": 0.1907,
      "step": 1410
    },
    {
      "epoch": 2.8629032258064515,
      "grad_norm": 11.353324890136719,
      "learning_rate": 5.872211454460596e-09,
      "loss": 0.2084,
      "step": 1420
    },
    {
      "epoch": 2.883064516129032,
      "grad_norm": 10.764039039611816,
      "learning_rate": 4.295726290277579e-09,
      "loss": 0.2057,
      "step": 1430
    },
    {
      "epoch": 2.903225806451613,
      "grad_norm": 11.042613983154297,
      "learning_rate": 2.964271653310646e-09,
      "loss": 0.2123,
      "step": 1440
    },
    {
      "epoch": 2.9233870967741935,
      "grad_norm": 15.109175682067871,
      "learning_rate": 1.878505692074872e-09,
      "loss": 0.1824,
      "step": 1450
    },
    {
      "epoch": 2.943548387096774,
      "grad_norm": 12.680368423461914,
      "learning_rate": 1.0389651092375662e-09,
      "loss": 0.1996,
      "step": 1460
    },
    {
      "epoch": 2.963709677419355,
      "grad_norm": 11.96649169921875,
      "learning_rate": 4.4606489632198485e-10,
      "loss": 0.2064,
      "step": 1470
    },
    {
      "epoch": 2.9838709677419355,
      "grad_norm": 11.979409217834473,
      "learning_rate": 1.0009812857370015e-10,
      "loss": 0.1779,
      "step": 1480
    },
    {
      "epoch": 3.0,
      "step": 1488,
      "total_flos": 1.1291739690022994e+18,
      "train_loss": 0.3318688049111315,
      "train_runtime": 21295.3074,
      "train_samples_per_second": 0.559,
      "train_steps_per_second": 0.07
    }
  ],
  "logging_steps": 10,
  "max_steps": 1488,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 2000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.1291739690022994e+18,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}