{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 10000000,
"global_step": 1488,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.020161290322580645,
"grad_norm": 104.71724700927734,
"learning_rate": 1.2e-07,
"loss": 1.4171,
"step": 10
},
{
"epoch": 0.04032258064516129,
"grad_norm": 19.342660903930664,
"learning_rate": 2.533333333333333e-07,
"loss": 1.4035,
"step": 20
},
{
"epoch": 0.06048387096774194,
"grad_norm": 16.795289993286133,
"learning_rate": 3.8666666666666664e-07,
"loss": 1.25,
"step": 30
},
{
"epoch": 0.08064516129032258,
"grad_norm": 12.612614631652832,
"learning_rate": 5.2e-07,
"loss": 1.0432,
"step": 40
},
{
"epoch": 0.10080645161290322,
"grad_norm": 14.135688781738281,
"learning_rate": 6.533333333333333e-07,
"loss": 0.9721,
"step": 50
},
{
"epoch": 0.12096774193548387,
"grad_norm": 15.069091796875,
"learning_rate": 7.866666666666666e-07,
"loss": 0.8305,
"step": 60
},
{
"epoch": 0.14112903225806453,
"grad_norm": 12.399056434631348,
"learning_rate": 9.2e-07,
"loss": 0.7259,
"step": 70
},
{
"epoch": 0.16129032258064516,
"grad_norm": 8.677566528320312,
"learning_rate": 9.999802270007193e-07,
"loss": 0.673,
"step": 80
},
{
"epoch": 0.1814516129032258,
"grad_norm": 10.323257446289062,
"learning_rate": 9.997577987186727e-07,
"loss": 0.6432,
"step": 90
},
{
"epoch": 0.20161290322580644,
"grad_norm": 10.148702621459961,
"learning_rate": 9.992883362200682e-07,
"loss": 0.6266,
"step": 100
},
{
"epoch": 0.2217741935483871,
"grad_norm": 11.491229057312012,
"learning_rate": 9.985720715639167e-07,
"loss": 0.5784,
"step": 110
},
{
"epoch": 0.24193548387096775,
"grad_norm": 10.31007194519043,
"learning_rate": 9.976093588054797e-07,
"loss": 0.5034,
"step": 120
},
{
"epoch": 0.2620967741935484,
"grad_norm": 8.234132766723633,
"learning_rate": 9.964006738212574e-07,
"loss": 0.5401,
"step": 130
},
{
"epoch": 0.28225806451612906,
"grad_norm": 11.453391075134277,
"learning_rate": 9.949466140737583e-07,
"loss": 0.5411,
"step": 140
},
{
"epoch": 0.3024193548387097,
"grad_norm": 11.507453918457031,
"learning_rate": 9.932478983161692e-07,
"loss": 0.5281,
"step": 150
},
{
"epoch": 0.3225806451612903,
"grad_norm": 12.632501602172852,
"learning_rate": 9.913053662370705e-07,
"loss": 0.4564,
"step": 160
},
{
"epoch": 0.34274193548387094,
"grad_norm": 9.758938789367676,
"learning_rate": 9.891199780453699e-07,
"loss": 0.4815,
"step": 170
},
{
"epoch": 0.3629032258064516,
"grad_norm": 11.363844871520996,
"learning_rate": 9.866928139956655e-07,
"loss": 0.5007,
"step": 180
},
{
"epoch": 0.38306451612903225,
"grad_norm": 10.484410285949707,
"learning_rate": 9.840250738542662e-07,
"loss": 0.4611,
"step": 190
},
{
"epoch": 0.4032258064516129,
"grad_norm": 12.169049263000488,
"learning_rate": 9.811180763061378e-07,
"loss": 0.4685,
"step": 200
},
{
"epoch": 0.42338709677419356,
"grad_norm": 9.939359664916992,
"learning_rate": 9.77973258303067e-07,
"loss": 0.4794,
"step": 210
},
{
"epoch": 0.4435483870967742,
"grad_norm": 9.921211242675781,
"learning_rate": 9.745921743533651e-07,
"loss": 0.3909,
"step": 220
},
{
"epoch": 0.4637096774193548,
"grad_norm": 11.779664993286133,
"learning_rate": 9.709764957534615e-07,
"loss": 0.4505,
"step": 230
},
{
"epoch": 0.4838709677419355,
"grad_norm": 9.654085159301758,
"learning_rate": 9.671280097617692e-07,
"loss": 0.4118,
"step": 240
},
{
"epoch": 0.5040322580645161,
"grad_norm": 12.259678840637207,
"learning_rate": 9.63048618715229e-07,
"loss": 0.4373,
"step": 250
},
{
"epoch": 0.5241935483870968,
"grad_norm": 12.087785720825195,
"learning_rate": 9.58740339088969e-07,
"loss": 0.3508,
"step": 260
},
{
"epoch": 0.5443548387096774,
"grad_norm": 9.039093017578125,
"learning_rate": 9.542053004995452e-07,
"loss": 0.3848,
"step": 270
},
{
"epoch": 0.5645161290322581,
"grad_norm": 10.91696834564209,
"learning_rate": 9.494457446522555e-07,
"loss": 0.3858,
"step": 280
},
{
"epoch": 0.5846774193548387,
"grad_norm": 11.28246021270752,
"learning_rate": 9.444640242330468e-07,
"loss": 0.4052,
"step": 290
},
{
"epoch": 0.6048387096774194,
"grad_norm": 9.38158893585205,
"learning_rate": 9.392626017455638e-07,
"loss": 0.3984,
"step": 300
},
{
"epoch": 0.625,
"grad_norm": 13.301438331604004,
"learning_rate": 9.338440482939145e-07,
"loss": 0.3586,
"step": 310
},
{
"epoch": 0.6451612903225806,
"grad_norm": 12.870404243469238,
"learning_rate": 9.282110423117524e-07,
"loss": 0.3641,
"step": 320
},
{
"epoch": 0.6653225806451613,
"grad_norm": 11.24843978881836,
"learning_rate": 9.223663682383066e-07,
"loss": 0.3684,
"step": 330
},
{
"epoch": 0.6854838709677419,
"grad_norm": 12.58388614654541,
"learning_rate": 9.163129151420105e-07,
"loss": 0.3954,
"step": 340
},
{
"epoch": 0.7056451612903226,
"grad_norm": 10.611601829528809,
"learning_rate": 9.100536752924135e-07,
"loss": 0.3774,
"step": 350
},
{
"epoch": 0.7258064516129032,
"grad_norm": 12.370527267456055,
"learning_rate": 9.035917426810781e-07,
"loss": 0.3682,
"step": 360
},
{
"epoch": 0.7459677419354839,
"grad_norm": 12.168606758117676,
"learning_rate": 8.969303114921956e-07,
"loss": 0.3929,
"step": 370
},
{
"epoch": 0.7661290322580645,
"grad_norm": 10.513528823852539,
"learning_rate": 8.900726745236751e-07,
"loss": 0.3525,
"step": 380
},
{
"epoch": 0.7862903225806451,
"grad_norm": 10.08989429473877,
"learning_rate": 8.83022221559489e-07,
"loss": 0.3961,
"step": 390
},
{
"epoch": 0.8064516129032258,
"grad_norm": 10.106974601745605,
"learning_rate": 8.757824376940745e-07,
"loss": 0.3609,
"step": 400
},
{
"epoch": 0.8266129032258065,
"grad_norm": 12.857497215270996,
"learning_rate": 8.68356901609625e-07,
"loss": 0.404,
"step": 410
},
{
"epoch": 0.8467741935483871,
"grad_norm": 12.46678352355957,
"learning_rate": 8.60749283807119e-07,
"loss": 0.3438,
"step": 420
},
{
"epoch": 0.8669354838709677,
"grad_norm": 14.995087623596191,
"learning_rate": 8.529633447919622e-07,
"loss": 0.3253,
"step": 430
},
{
"epoch": 0.8870967741935484,
"grad_norm": 10.057149887084961,
"learning_rate": 8.450029332151406e-07,
"loss": 0.3448,
"step": 440
},
{
"epoch": 0.907258064516129,
"grad_norm": 12.985930442810059,
"learning_rate": 8.368719839708018e-07,
"loss": 0.4045,
"step": 450
},
{
"epoch": 0.9274193548387096,
"grad_norm": 10.555837631225586,
"learning_rate": 8.285745162512056e-07,
"loss": 0.3673,
"step": 460
},
{
"epoch": 0.9475806451612904,
"grad_norm": 9.865274429321289,
"learning_rate": 8.20114631560006e-07,
"loss": 0.3876,
"step": 470
},
{
"epoch": 0.967741935483871,
"grad_norm": 11.42111587524414,
"learning_rate": 8.114965116848454e-07,
"loss": 0.3165,
"step": 480
},
{
"epoch": 0.9879032258064516,
"grad_norm": 8.909076690673828,
"learning_rate": 8.02724416630264e-07,
"loss": 0.342,
"step": 490
},
{
"epoch": 1.0080645161290323,
"grad_norm": 10.668339729309082,
"learning_rate": 7.938026825119463e-07,
"loss": 0.3457,
"step": 500
},
{
"epoch": 1.028225806451613,
"grad_norm": 12.418227195739746,
"learning_rate": 7.847357194133442e-07,
"loss": 0.2843,
"step": 510
},
{
"epoch": 1.0483870967741935,
"grad_norm": 10.189166069030762,
"learning_rate": 7.755280092057391e-07,
"loss": 0.2967,
"step": 520
},
{
"epoch": 1.0685483870967742,
"grad_norm": 9.147650718688965,
"learning_rate": 7.661841033328169e-07,
"loss": 0.2698,
"step": 530
},
{
"epoch": 1.0887096774193548,
"grad_norm": 10.935348510742188,
"learning_rate": 7.567086205608533e-07,
"loss": 0.3049,
"step": 540
},
{
"epoch": 1.1088709677419355,
"grad_norm": 9.401994705200195,
"learning_rate": 7.471062446956225e-07,
"loss": 0.2689,
"step": 550
},
{
"epoch": 1.129032258064516,
"grad_norm": 13.119884490966797,
"learning_rate": 7.373817222671535e-07,
"loss": 0.2712,
"step": 560
},
{
"epoch": 1.1491935483870968,
"grad_norm": 9.512726783752441,
"learning_rate": 7.275398601834835e-07,
"loss": 0.2703,
"step": 570
},
{
"epoch": 1.1693548387096775,
"grad_norm": 12.284181594848633,
"learning_rate": 7.175855233545667e-07,
"loss": 0.2399,
"step": 580
},
{
"epoch": 1.189516129032258,
"grad_norm": 9.64366626739502,
"learning_rate": 7.075236322875087e-07,
"loss": 0.2685,
"step": 590
},
{
"epoch": 1.2096774193548387,
"grad_norm": 11.060663223266602,
"learning_rate": 6.973591606543226e-07,
"loss": 0.2758,
"step": 600
},
{
"epoch": 1.2298387096774193,
"grad_norm": 11.46677303314209,
"learning_rate": 6.870971328334037e-07,
"loss": 0.2942,
"step": 610
},
{
"epoch": 1.25,
"grad_norm": 11.921086311340332,
"learning_rate": 6.767426214259388e-07,
"loss": 0.2779,
"step": 620
},
{
"epoch": 1.2701612903225805,
"grad_norm": 12.505317687988281,
"learning_rate": 6.663007447484806e-07,
"loss": 0.2561,
"step": 630
},
{
"epoch": 1.2903225806451613,
"grad_norm": 8.572080612182617,
"learning_rate": 6.557766643029226e-07,
"loss": 0.2456,
"step": 640
},
{
"epoch": 1.310483870967742,
"grad_norm": 9.482403755187988,
"learning_rate": 6.451755822251284e-07,
"loss": 0.2666,
"step": 650
},
{
"epoch": 1.3306451612903225,
"grad_norm": 11.761874198913574,
"learning_rate": 6.345027387134749e-07,
"loss": 0.2781,
"step": 660
},
{
"epoch": 1.3508064516129032,
"grad_norm": 9.536797523498535,
"learning_rate": 6.237634094385813e-07,
"loss": 0.2528,
"step": 670
},
{
"epoch": 1.370967741935484,
"grad_norm": 10.880298614501953,
"learning_rate": 6.129629029355033e-07,
"loss": 0.3138,
"step": 680
},
{
"epoch": 1.3911290322580645,
"grad_norm": 11.278773307800293,
"learning_rate": 6.02106557979682e-07,
"loss": 0.301,
"step": 690
},
{
"epoch": 1.4112903225806452,
"grad_norm": 13.422077178955078,
"learning_rate": 5.91199740947946e-07,
"loss": 0.2827,
"step": 700
},
{
"epoch": 1.4314516129032258,
"grad_norm": 11.68918228149414,
"learning_rate": 5.802478431658682e-07,
"loss": 0.2921,
"step": 710
},
{
"epoch": 1.4516129032258065,
"grad_norm": 9.534116744995117,
"learning_rate": 5.692562782427916e-07,
"loss": 0.2727,
"step": 720
},
{
"epoch": 1.471774193548387,
"grad_norm": 13.032623291015625,
"learning_rate": 5.582304793958399e-07,
"loss": 0.2581,
"step": 730
},
{
"epoch": 1.4919354838709677,
"grad_norm": 10.85723876953125,
"learning_rate": 5.471758967642341e-07,
"loss": 0.2832,
"step": 740
},
{
"epoch": 1.5120967741935485,
"grad_norm": 12.404828071594238,
"learning_rate": 5.36097994715248e-07,
"loss": 0.2715,
"step": 750
},
{
"epoch": 1.532258064516129,
"grad_norm": 10.57315731048584,
"learning_rate": 5.250022491431259e-07,
"loss": 0.2422,
"step": 760
},
{
"epoch": 1.5524193548387095,
"grad_norm": 11.932204246520996,
"learning_rate": 5.138941447623065e-07,
"loss": 0.2868,
"step": 770
},
{
"epoch": 1.5725806451612905,
"grad_norm": 10.36052131652832,
"learning_rate": 5.027791723962854e-07,
"loss": 0.2738,
"step": 780
},
{
"epoch": 1.592741935483871,
"grad_norm": 14.13831615447998,
"learning_rate": 4.916628262634568e-07,
"loss": 0.2561,
"step": 790
},
{
"epoch": 1.6129032258064515,
"grad_norm": 10.870752334594727,
"learning_rate": 4.805506012612792e-07,
"loss": 0.2675,
"step": 800
},
{
"epoch": 1.6330645161290323,
"grad_norm": 12.759139060974121,
"learning_rate": 4.694479902501033e-07,
"loss": 0.2526,
"step": 810
},
{
"epoch": 1.653225806451613,
"grad_norm": 10.99550724029541,
"learning_rate": 4.5836048133800864e-07,
"loss": 0.2441,
"step": 820
},
{
"epoch": 1.6733870967741935,
"grad_norm": 10.80727767944336,
"learning_rate": 4.4729355516798814e-07,
"loss": 0.2661,
"step": 830
},
{
"epoch": 1.6935483870967742,
"grad_norm": 10.887651443481445,
"learning_rate": 4.362526822088228e-07,
"loss": 0.2845,
"step": 840
},
{
"epoch": 1.713709677419355,
"grad_norm": 12.84078311920166,
"learning_rate": 4.252433200509868e-07,
"loss": 0.2754,
"step": 850
},
{
"epoch": 1.7338709677419355,
"grad_norm": 14.602743148803711,
"learning_rate": 4.142709107089171e-07,
"loss": 0.2506,
"step": 860
},
{
"epoch": 1.754032258064516,
"grad_norm": 12.086965560913086,
"learning_rate": 4.033408779309819e-07,
"loss": 0.255,
"step": 870
},
{
"epoch": 1.7741935483870968,
"grad_norm": 11.939308166503906,
"learning_rate": 3.9245862451848093e-07,
"loss": 0.2669,
"step": 880
},
{
"epoch": 1.7943548387096775,
"grad_norm": 13.408591270446777,
"learning_rate": 3.816295296549967e-07,
"loss": 0.2882,
"step": 890
},
{
"epoch": 1.814516129032258,
"grad_norm": 13.375335693359375,
"learning_rate": 3.708589462474221e-07,
"loss": 0.2816,
"step": 900
},
{
"epoch": 1.8346774193548387,
"grad_norm": 11.074604034423828,
"learning_rate": 3.6015219827997677e-07,
"loss": 0.2672,
"step": 910
},
{
"epoch": 1.8548387096774195,
"grad_norm": 11.15030288696289,
"learning_rate": 3.4951457818251934e-07,
"loss": 0.2586,
"step": 920
},
{
"epoch": 1.875,
"grad_norm": 13.091116905212402,
"learning_rate": 3.3895134421445805e-07,
"loss": 0.2383,
"step": 930
},
{
"epoch": 1.8951612903225805,
"grad_norm": 10.90065860748291,
"learning_rate": 3.2846771786555073e-07,
"loss": 0.2892,
"step": 940
},
{
"epoch": 1.9153225806451613,
"grad_norm": 11.27858829498291,
"learning_rate": 3.180688812748825e-07,
"loss": 0.2704,
"step": 950
},
{
"epoch": 1.935483870967742,
"grad_norm": 11.70596694946289,
"learning_rate": 3.0775997466929315e-07,
"loss": 0.2994,
"step": 960
},
{
"epoch": 1.9556451612903225,
"grad_norm": 14.318155288696289,
"learning_rate": 2.9754609382252244e-07,
"loss": 0.2765,
"step": 970
},
{
"epoch": 1.9758064516129032,
"grad_norm": 9.653993606567383,
"learning_rate": 2.874322875363283e-07,
"loss": 0.2346,
"step": 980
},
{
"epoch": 1.995967741935484,
"grad_norm": 10.425935745239258,
"learning_rate": 2.774235551448265e-07,
"loss": 0.2732,
"step": 990
},
{
"epoch": 2.0161290322580645,
"grad_norm": 12.588045120239258,
"learning_rate": 2.6752484404327735e-07,
"loss": 0.1989,
"step": 1000
},
{
"epoch": 2.036290322580645,
"grad_norm": 13.573112487792969,
"learning_rate": 2.5774104724255187e-07,
"loss": 0.1974,
"step": 1010
},
{
"epoch": 2.056451612903226,
"grad_norm": 10.552323341369629,
"learning_rate": 2.480770009504773e-07,
"loss": 0.2043,
"step": 1020
},
{
"epoch": 2.0766129032258065,
"grad_norm": 13.050751686096191,
"learning_rate": 2.3853748218125996e-07,
"loss": 0.2061,
"step": 1030
},
{
"epoch": 2.096774193548387,
"grad_norm": 11.474674224853516,
"learning_rate": 2.2912720639417154e-07,
"loss": 0.185,
"step": 1040
},
{
"epoch": 2.1169354838709675,
"grad_norm": 11.878608703613281,
"learning_rate": 2.1985082516265995e-07,
"loss": 0.2002,
"step": 1050
},
{
"epoch": 2.1370967741935485,
"grad_norm": 14.378472328186035,
"learning_rate": 2.1071292387503858e-07,
"loss": 0.1829,
"step": 1060
},
{
"epoch": 2.157258064516129,
"grad_norm": 10.852553367614746,
"learning_rate": 2.0171801946789414e-07,
"loss": 0.1768,
"step": 1070
},
{
"epoch": 2.1774193548387095,
"grad_norm": 11.97855281829834,
"learning_rate": 1.9287055819332965e-07,
"loss": 0.1846,
"step": 1080
},
{
"epoch": 2.1975806451612905,
"grad_norm": 11.896756172180176,
"learning_rate": 1.84174913421145e-07,
"loss": 0.2036,
"step": 1090
},
{
"epoch": 2.217741935483871,
"grad_norm": 12.411077499389648,
"learning_rate": 1.7563538347704783e-07,
"loss": 0.2194,
"step": 1100
},
{
"epoch": 2.2379032258064515,
"grad_norm": 10.730416297912598,
"learning_rate": 1.6725618951795673e-07,
"loss": 0.1853,
"step": 1110
},
{
"epoch": 2.258064516129032,
"grad_norm": 11.932442665100098,
"learning_rate": 1.5904147344544928e-07,
"loss": 0.2063,
"step": 1120
},
{
"epoch": 2.278225806451613,
"grad_norm": 11.115213394165039,
"learning_rate": 1.5099529585838827e-07,
"loss": 0.2214,
"step": 1130
},
{
"epoch": 2.2983870967741935,
"grad_norm": 11.2450532913208,
"learning_rate": 1.4312163404573623e-07,
"loss": 0.2046,
"step": 1140
},
{
"epoch": 2.318548387096774,
"grad_norm": 11.3887357711792,
"learning_rate": 1.354243800205483e-07,
"loss": 0.233,
"step": 1150
},
{
"epoch": 2.338709677419355,
"grad_norm": 11.942183494567871,
"learning_rate": 1.279073385961217e-07,
"loss": 0.1868,
"step": 1160
},
{
"epoch": 2.3588709677419355,
"grad_norm": 12.46717357635498,
"learning_rate": 1.2057422550524504e-07,
"loss": 0.2017,
"step": 1170
},
{
"epoch": 2.379032258064516,
"grad_norm": 16.602731704711914,
"learning_rate": 1.1342866556348302e-07,
"loss": 0.2178,
"step": 1180
},
{
"epoch": 2.399193548387097,
"grad_norm": 12.216778755187988,
"learning_rate": 1.0647419087740117e-07,
"loss": 0.1798,
"step": 1190
},
{
"epoch": 2.4193548387096775,
"grad_norm": 9.579974174499512,
"learning_rate": 9.971423909861803e-08,
"loss": 0.2114,
"step": 1200
},
{
"epoch": 2.439516129032258,
"grad_norm": 12.557072639465332,
"learning_rate": 9.315215172454688e-08,
"loss": 0.1898,
"step": 1210
},
{
"epoch": 2.4596774193548385,
"grad_norm": 10.30516242980957,
"learning_rate": 8.679117244666706e-08,
"loss": 0.1924,
"step": 1220
},
{
"epoch": 2.4798387096774195,
"grad_norm": 12.587124824523926,
"learning_rate": 8.063444554714172e-08,
"loss": 0.2189,
"step": 1230
},
{
"epoch": 2.5,
"grad_norm": 11.642440795898438,
"learning_rate": 7.468501434457469e-08,
"loss": 0.2279,
"step": 1240
},
{
"epoch": 2.5201612903225805,
"grad_norm": 9.209700584411621,
"learning_rate": 6.894581968967367e-08,
"loss": 0.1969,
"step": 1250
},
{
"epoch": 2.540322580645161,
"grad_norm": 9.485260009765625,
"learning_rate": 6.341969851156492e-08,
"loss": 0.1784,
"step": 1260
},
{
"epoch": 2.560483870967742,
"grad_norm": 8.158933639526367,
"learning_rate": 5.810938241547669e-08,
"loss": 0.1612,
"step": 1270
},
{
"epoch": 2.5806451612903225,
"grad_norm": 10.125198364257812,
"learning_rate": 5.301749633248531e-08,
"loss": 0.2255,
"step": 1280
},
{
"epoch": 2.600806451612903,
"grad_norm": 10.74553394317627,
"learning_rate": 4.814655722199096e-08,
"loss": 0.191,
"step": 1290
},
{
"epoch": 2.620967741935484,
"grad_norm": 12.781230926513672,
"learning_rate": 4.349897282756487e-08,
"loss": 0.2198,
"step": 1300
},
{
"epoch": 2.6411290322580645,
"grad_norm": 14.817058563232422,
"learning_rate": 3.90770404867829e-08,
"loss": 0.1992,
"step": 1310
},
{
"epoch": 2.661290322580645,
"grad_norm": 10.511248588562012,
"learning_rate": 3.4882945995633073e-08,
"loss": 0.2142,
"step": 1320
},
{
"epoch": 2.681451612903226,
"grad_norm": 12.40715503692627,
"learning_rate": 3.0918762528059805e-08,
"loss": 0.1963,
"step": 1330
},
{
"epoch": 2.7016129032258065,
"grad_norm": 14.832979202270508,
"learning_rate": 2.718644961117744e-08,
"loss": 0.2053,
"step": 1340
},
{
"epoch": 2.721774193548387,
"grad_norm": 12.077789306640625,
"learning_rate": 2.368785215666064e-08,
"loss": 0.1808,
"step": 1350
},
{
"epoch": 2.741935483870968,
"grad_norm": 12.798100471496582,
"learning_rate": 2.042469954879006e-08,
"loss": 0.2104,
"step": 1360
},
{
"epoch": 2.7620967741935485,
"grad_norm": 11.502704620361328,
"learning_rate": 1.7398604789604033e-08,
"loss": 0.2131,
"step": 1370
},
{
"epoch": 2.782258064516129,
"grad_norm": 13.160285949707031,
"learning_rate": 1.4611063701578886e-08,
"loss": 0.2023,
"step": 1380
},
{
"epoch": 2.8024193548387095,
"grad_norm": 14.090004920959473,
"learning_rate": 1.2063454188232348e-08,
"loss": 0.2045,
"step": 1390
},
{
"epoch": 2.8225806451612905,
"grad_norm": 11.760912895202637,
"learning_rate": 9.757035553014493e-09,
"loss": 0.2063,
"step": 1400
},
{
"epoch": 2.842741935483871,
"grad_norm": 13.928801536560059,
"learning_rate": 7.692947876824728e-09,
"loss": 0.1907,
"step": 1410
},
{
"epoch": 2.8629032258064515,
"grad_norm": 11.353324890136719,
"learning_rate": 5.872211454460596e-09,
"loss": 0.2084,
"step": 1420
},
{
"epoch": 2.883064516129032,
"grad_norm": 10.764039039611816,
"learning_rate": 4.295726290277579e-09,
"loss": 0.2057,
"step": 1430
},
{
"epoch": 2.903225806451613,
"grad_norm": 11.042613983154297,
"learning_rate": 2.964271653310646e-09,
"loss": 0.2123,
"step": 1440
},
{
"epoch": 2.9233870967741935,
"grad_norm": 15.109175682067871,
"learning_rate": 1.878505692074872e-09,
"loss": 0.1824,
"step": 1450
},
{
"epoch": 2.943548387096774,
"grad_norm": 12.680368423461914,
"learning_rate": 1.0389651092375662e-09,
"loss": 0.1996,
"step": 1460
},
{
"epoch": 2.963709677419355,
"grad_norm": 11.96649169921875,
"learning_rate": 4.4606489632198485e-10,
"loss": 0.2064,
"step": 1470
},
{
"epoch": 2.9838709677419355,
"grad_norm": 11.979409217834473,
"learning_rate": 1.0009812857370015e-10,
"loss": 0.1779,
"step": 1480
},
{
"epoch": 3.0,
"step": 1488,
"total_flos": 1.1291739690022994e+18,
"train_loss": 0.3318688049111315,
"train_runtime": 21295.3074,
"train_samples_per_second": 0.559,
"train_steps_per_second": 0.07
}
],
"logging_steps": 10,
"max_steps": 1488,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 2000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.1291739690022994e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}