{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9929896907216493, "eval_steps": 25, "global_step": 909, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.032989690721649485, "grad_norm": 0.27264630794525146, "learning_rate": 2.1978021978021977e-05, "loss": 1.1728, "step": 10 }, { "epoch": 0.06597938144329897, "grad_norm": 0.44495731592178345, "learning_rate": 4.3956043956043955e-05, "loss": 1.033, "step": 20 }, { "epoch": 0.08247422680412371, "eval_loss": 1.0129430294036865, "eval_runtime": 400.9284, "eval_samples_per_second": 10.753, "eval_steps_per_second": 0.673, "step": 25 }, { "epoch": 0.09896907216494845, "grad_norm": 0.2443210482597351, "learning_rate": 6.593406593406594e-05, "loss": 1.0181, "step": 30 }, { "epoch": 0.13195876288659794, "grad_norm": 0.1831207573413849, "learning_rate": 8.791208791208791e-05, "loss": 0.9783, "step": 40 }, { "epoch": 0.16494845360824742, "grad_norm": 0.2216944396495819, "learning_rate": 0.0001098901098901099, "loss": 0.9469, "step": 50 }, { "epoch": 0.16494845360824742, "eval_loss": 0.9523223042488098, "eval_runtime": 400.9411, "eval_samples_per_second": 10.752, "eval_steps_per_second": 0.673, "step": 50 }, { "epoch": 0.1979381443298969, "grad_norm": 0.2070694863796234, "learning_rate": 0.00013186813186813188, "loss": 0.9491, "step": 60 }, { "epoch": 0.2309278350515464, "grad_norm": 0.2137373685836792, "learning_rate": 0.00015384615384615385, "loss": 0.9185, "step": 70 }, { "epoch": 0.24742268041237114, "eval_loss": 0.9273656010627747, "eval_runtime": 400.9265, "eval_samples_per_second": 10.753, "eval_steps_per_second": 0.673, "step": 75 }, { "epoch": 0.2639175257731959, "grad_norm": 0.21778565645217896, "learning_rate": 0.00017582417582417582, "loss": 0.9184, "step": 80 }, { "epoch": 0.29690721649484536, "grad_norm": 0.23960737884044647, "learning_rate": 0.0001978021978021978, "loss": 0.9274, "step": 90 }, { "epoch": 0.32989690721649484, "grad_norm": 0.2075095921754837, "learning_rate": 0.0001999402682936637, "loss": 0.9374, "step": 100 }, { "epoch": 0.32989690721649484, "eval_loss": 0.9142278432846069, "eval_runtime": 400.9766, "eval_samples_per_second": 10.751, "eval_steps_per_second": 0.673, "step": 100 }, { "epoch": 0.3628865979381443, "grad_norm": 0.19346989691257477, "learning_rate": 0.00019973387993177673, "loss": 0.8995, "step": 110 }, { "epoch": 0.3958762886597938, "grad_norm": 0.41485053300857544, "learning_rate": 0.00019938040179954785, "loss": 0.9065, "step": 120 }, { "epoch": 0.41237113402061853, "eval_loss": 0.9051461815834045, "eval_runtime": 401.0005, "eval_samples_per_second": 10.751, "eval_steps_per_second": 0.673, "step": 125 }, { "epoch": 0.4288659793814433, "grad_norm": 0.19175270199775696, "learning_rate": 0.00019888035521448044, "loss": 0.9071, "step": 130 }, { "epoch": 0.4618556701030928, "grad_norm": 0.19583536684513092, "learning_rate": 0.00019823447765653023, "loss": 0.9121, "step": 140 }, { "epoch": 0.4948453608247423, "grad_norm": 0.20553036034107208, "learning_rate": 0.00019744372168045324, "loss": 0.904, "step": 150 }, { "epoch": 0.4948453608247423, "eval_loss": 0.8971042633056641, "eval_runtime": 400.9516, "eval_samples_per_second": 10.752, "eval_steps_per_second": 0.673, "step": 150 }, { "epoch": 0.5278350515463918, "grad_norm": 0.23569311201572418, "learning_rate": 0.0001965092535109567, "loss": 0.8863, "step": 160 }, { "epoch": 0.5608247422680412, "grad_norm": 0.19721971452236176, "learning_rate": 0.00019543245132272441, "loss": 0.8818, "step": 170 }, { "epoch": 0.5773195876288659, "eval_loss": 0.8927144408226013, "eval_runtime": 400.9553, "eval_samples_per_second": 10.752, "eval_steps_per_second": 0.673, "step": 175 }, { "epoch": 0.5938144329896907, "grad_norm": 0.22627437114715576, "learning_rate": 0.00019421490320785384, "loss": 0.8697, "step": 180 }, { "epoch": 0.6268041237113402, "grad_norm": 0.20646433532238007, "learning_rate": 0.0001928584048337022, "loss": 0.8874, "step": 190 }, { "epoch": 0.6597938144329897, "grad_norm": 0.22681070864200592, "learning_rate": 0.00019136495679459564, "loss": 0.8951, "step": 200 }, { "epoch": 0.6597938144329897, "eval_loss": 0.8895179629325867, "eval_runtime": 400.9416, "eval_samples_per_second": 10.752, "eval_steps_per_second": 0.673, "step": 200 }, { "epoch": 0.6927835051546392, "grad_norm": 0.19872383773326874, "learning_rate": 0.00018973676166130795, "loss": 0.8844, "step": 210 }, { "epoch": 0.7257731958762886, "grad_norm": 0.2135637402534485, "learning_rate": 0.00018797622073265946, "loss": 0.8829, "step": 220 }, { "epoch": 0.7422680412371134, "eval_loss": 0.8846537470817566, "eval_runtime": 400.9966, "eval_samples_per_second": 10.751, "eval_steps_per_second": 0.673, "step": 225 }, { "epoch": 0.7587628865979381, "grad_norm": 0.19577114284038544, "learning_rate": 0.00018608593049402754, "loss": 0.9059, "step": 230 }, { "epoch": 0.7917525773195876, "grad_norm": 0.20382213592529297, "learning_rate": 0.00018406867878799154, "loss": 0.8655, "step": 240 }, { "epoch": 0.8247422680412371, "grad_norm": 0.19666744768619537, "learning_rate": 0.0001819274407027599, "loss": 0.8963, "step": 250 }, { "epoch": 0.8247422680412371, "eval_loss": 0.8801769018173218, "eval_runtime": 400.9614, "eval_samples_per_second": 10.752, "eval_steps_per_second": 0.673, "step": 250 }, { "epoch": 0.8577319587628865, "grad_norm": 0.20239554345607758, "learning_rate": 0.000179665374184443, "loss": 0.877, "step": 260 }, { "epoch": 0.8907216494845361, "grad_norm": 0.21307946741580963, "learning_rate": 0.0001772858153796432, "loss": 0.8742, "step": 270 }, { "epoch": 0.9072164948453608, "eval_loss": 0.8792086839675903, "eval_runtime": 400.9808, "eval_samples_per_second": 10.751, "eval_steps_per_second": 0.673, "step": 275 }, { "epoch": 0.9237113402061856, "grad_norm": 0.21087533235549927, "learning_rate": 0.00017479227371523082, "loss": 0.8661, "step": 280 }, { "epoch": 0.9567010309278351, "grad_norm": 0.22192323207855225, "learning_rate": 0.0001721884267225624, "loss": 0.8744, "step": 290 }, { "epoch": 0.9896907216494846, "grad_norm": 0.20492371916770935, "learning_rate": 0.00016947811461377467, "loss": 0.9018, "step": 300 }, { "epoch": 0.9896907216494846, "eval_loss": 0.8737736344337463, "eval_runtime": 400.9528, "eval_samples_per_second": 10.752, "eval_steps_per_second": 0.673, "step": 300 }, { "epoch": 1.0197938144329897, "grad_norm": 0.21177279949188232, "learning_rate": 0.00016666533461815326, "loss": 0.8363, "step": 310 }, { "epoch": 1.0527835051546393, "grad_norm": 0.2528247535228729, "learning_rate": 0.00016375423508692912, "loss": 0.8414, "step": 320 }, { "epoch": 1.069278350515464, "eval_loss": 0.8753945231437683, "eval_runtime": 400.9781, "eval_samples_per_second": 10.751, "eval_steps_per_second": 0.673, "step": 325 }, { "epoch": 1.0857731958762886, "grad_norm": 0.21822592616081238, "learning_rate": 0.00016074910937519663, "loss": 0.8745, "step": 330 }, { "epoch": 1.1187628865979382, "grad_norm": 0.21870850026607513, "learning_rate": 0.00015765438950997705, "loss": 0.8226, "step": 340 }, { "epoch": 1.1517525773195876, "grad_norm": 0.20419549942016602, "learning_rate": 0.0001544746396537651, "loss": 0.8508, "step": 350 }, { "epoch": 1.1517525773195876, "eval_loss": 0.8740637898445129, "eval_runtime": 401.0482, "eval_samples_per_second": 10.749, "eval_steps_per_second": 0.673, "step": 350 }, { "epoch": 1.1847422680412372, "grad_norm": 0.2176382690668106, "learning_rate": 0.00015121454937319976, "loss": 0.8092, "step": 360 }, { "epoch": 1.2177319587628865, "grad_norm": 0.26385313272476196, "learning_rate": 0.00014787892672278556, "loss": 0.8263, "step": 370 }, { "epoch": 1.2342268041237112, "eval_loss": 0.8707788586616516, "eval_runtime": 401.0479, "eval_samples_per_second": 10.749, "eval_steps_per_second": 0.673, "step": 375 }, { "epoch": 1.2507216494845361, "grad_norm": 0.21340814232826233, "learning_rate": 0.00014447269115386573, "loss": 0.8175, "step": 380 }, { "epoch": 1.2837113402061855, "grad_norm": 0.23228245973587036, "learning_rate": 0.00014100086625930464, "loss": 0.8424, "step": 390 }, { "epoch": 1.316701030927835, "grad_norm": 0.23154015839099884, "learning_rate": 0.00013746857236458007, "loss": 0.8444, "step": 400 }, { "epoch": 1.316701030927835, "eval_loss": 0.8696116209030151, "eval_runtime": 401.028, "eval_samples_per_second": 10.75, "eval_steps_per_second": 0.673, "step": 400 }, { "epoch": 1.3496907216494844, "grad_norm": 0.22975751757621765, "learning_rate": 0.00013388101897621182, "loss": 0.8343, "step": 410 }, { "epoch": 1.382680412371134, "grad_norm": 0.31299638748168945, "learning_rate": 0.00013024349709866447, "loss": 0.8395, "step": 420 }, { "epoch": 1.3991752577319587, "eval_loss": 0.8670699000358582, "eval_runtime": 401.0553, "eval_samples_per_second": 10.749, "eval_steps_per_second": 0.673, "step": 425 }, { "epoch": 1.4156701030927836, "grad_norm": 0.2256530523300171, "learning_rate": 0.00012656137143105483, "loss": 0.836, "step": 430 }, { "epoch": 1.448659793814433, "grad_norm": 0.37599867582321167, "learning_rate": 0.0001228400724551728, "loss": 0.7908, "step": 440 }, { "epoch": 1.4816494845360824, "grad_norm": 0.23097045719623566, "learning_rate": 0.00011908508842648506, "loss": 0.84, "step": 450 }, { "epoch": 1.4816494845360824, "eval_loss": 0.8646743893623352, "eval_runtime": 401.0395, "eval_samples_per_second": 10.75, "eval_steps_per_second": 0.673, "step": 450 }, { "epoch": 1.514639175257732, "grad_norm": 0.23050455749034882, "learning_rate": 0.000115301957279932, "loss": 0.8314, "step": 460 }, { "epoch": 1.5476288659793815, "grad_norm": 0.23194897174835205, "learning_rate": 0.00011149625846245682, "loss": 0.8168, "step": 470 }, { "epoch": 1.5641237113402062, "eval_loss": 0.8628771901130676, "eval_runtime": 401.038, "eval_samples_per_second": 10.75, "eval_steps_per_second": 0.673, "step": 475 }, { "epoch": 1.580618556701031, "grad_norm": 0.23772668838500977, "learning_rate": 0.00010767360470431158, "loss": 0.803, "step": 480 }, { "epoch": 1.6136082474226803, "grad_norm": 0.40852147340774536, "learning_rate": 0.00010383963374127645, "loss": 0.8232, "step": 490 }, { "epoch": 1.6465979381443299, "grad_norm": 0.2158336341381073, "learning_rate": 0.0001, "loss": 0.837, "step": 500 }, { "epoch": 1.6465979381443299, "eval_loss": 0.8609479069709778, "eval_runtime": 401.0527, "eval_samples_per_second": 10.749, "eval_steps_per_second": 0.673, "step": 500 }, { "epoch": 1.6795876288659795, "grad_norm": 0.2508278787136078, "learning_rate": 9.616036625872357e-05, "loss": 0.8143, "step": 510 }, { "epoch": 1.7125773195876288, "grad_norm": 0.22889100015163422, "learning_rate": 9.232639529568843e-05, "loss": 0.8395, "step": 520 }, { "epoch": 1.7290721649484535, "eval_loss": 0.8594603538513184, "eval_runtime": 401.0524, "eval_samples_per_second": 10.749, "eval_steps_per_second": 0.673, "step": 525 }, { "epoch": 1.7455670103092782, "grad_norm": 0.2332814484834671, "learning_rate": 8.850374153754322e-05, "loss": 0.7963, "step": 530 }, { "epoch": 1.778556701030928, "grad_norm": 0.2086717039346695, "learning_rate": 8.469804272006801e-05, "loss": 0.7959, "step": 540 }, { "epoch": 1.8115463917525774, "grad_norm": 0.2422792613506317, "learning_rate": 8.091491157351495e-05, "loss": 0.807, "step": 550 }, { "epoch": 1.8115463917525774, "eval_loss": 0.8581506609916687, "eval_runtime": 401.0915, "eval_samples_per_second": 10.748, "eval_steps_per_second": 0.673, "step": 550 }, { "epoch": 1.8445360824742267, "grad_norm": 0.2212693840265274, "learning_rate": 7.71599275448272e-05, "loss": 0.8461, "step": 560 }, { "epoch": 1.8775257731958763, "grad_norm": 0.22461137175559998, "learning_rate": 7.343862856894521e-05, "loss": 0.841, "step": 570 }, { "epoch": 1.894020618556701, "eval_loss": 0.8570997714996338, "eval_runtime": 401.2014, "eval_samples_per_second": 10.745, "eval_steps_per_second": 0.673, "step": 575 }, { "epoch": 1.910515463917526, "grad_norm": 0.22947990894317627, "learning_rate": 6.975650290133554e-05, "loss": 0.8041, "step": 580 }, { "epoch": 1.9435051546391753, "grad_norm": 0.23240363597869873, "learning_rate": 6.611898102378818e-05, "loss": 0.8279, "step": 590 }, { "epoch": 1.9764948453608246, "grad_norm": 0.23037049174308777, "learning_rate": 6.253142763541996e-05, "loss": 0.8257, "step": 600 }, { "epoch": 1.9764948453608246, "eval_loss": 0.8549010157585144, "eval_runtime": 401.148, "eval_samples_per_second": 10.747, "eval_steps_per_second": 0.673, "step": 600 }, { "epoch": 2.00659793814433, "grad_norm": 0.2333364486694336, "learning_rate": 5.899913374069539e-05, "loss": 0.7969, "step": 610 }, { "epoch": 2.0395876288659793, "grad_norm": 0.24773956835269928, "learning_rate": 5.552730884613429e-05, "loss": 0.7783, "step": 620 }, { "epoch": 2.056082474226804, "eval_loss": 0.8614206910133362, "eval_runtime": 401.0858, "eval_samples_per_second": 10.748, "eval_steps_per_second": 0.673, "step": 625 }, { "epoch": 2.0725773195876287, "grad_norm": 0.24158383905887604, "learning_rate": 5.212107327721445e-05, "loss": 0.7527, "step": 630 }, { "epoch": 2.1055670103092785, "grad_norm": 0.2403087019920349, "learning_rate": 4.878545062680025e-05, "loss": 0.7648, "step": 640 }, { "epoch": 2.138556701030928, "grad_norm": 0.23482970893383026, "learning_rate": 4.5525360346234916e-05, "loss": 0.7875, "step": 650 }, { "epoch": 2.138556701030928, "eval_loss": 0.8609281182289124, "eval_runtime": 401.0846, "eval_samples_per_second": 10.748, "eval_steps_per_second": 0.673, "step": 650 }, { "epoch": 2.1715463917525772, "grad_norm": 0.36150163412094116, "learning_rate": 4.2345610490023004e-05, "loss": 0.7731, "step": 660 }, { "epoch": 2.2045360824742266, "grad_norm": 0.260690838098526, "learning_rate": 3.925089062480339e-05, "loss": 0.7808, "step": 670 }, { "epoch": 2.2210309278350517, "eval_loss": 0.8612596988677979, "eval_runtime": 401.1131, "eval_samples_per_second": 10.748, "eval_steps_per_second": 0.673, "step": 675 }, { "epoch": 2.2375257731958764, "grad_norm": 0.25605785846710205, "learning_rate": 3.624576491307088e-05, "loss": 0.7761, "step": 680 }, { "epoch": 2.270515463917526, "grad_norm": 0.29545921087265015, "learning_rate": 3.3334665381846744e-05, "loss": 0.7496, "step": 690 }, { "epoch": 2.303505154639175, "grad_norm": 0.26408135890960693, "learning_rate": 3.052188538622535e-05, "loss": 0.7521, "step": 700 }, { "epoch": 2.303505154639175, "eval_loss": 0.8600190281867981, "eval_runtime": 401.1633, "eval_samples_per_second": 10.746, "eval_steps_per_second": 0.673, "step": 700 }, { "epoch": 2.336494845360825, "grad_norm": 0.2684233784675598, "learning_rate": 2.7811573277437608e-05, "loss": 0.7634, "step": 710 }, { "epoch": 2.3694845360824743, "grad_norm": 0.2764691412448883, "learning_rate": 2.5207726284769194e-05, "loss": 0.7599, "step": 720 }, { "epoch": 2.385979381443299, "eval_loss": 0.8599956631660461, "eval_runtime": 401.2048, "eval_samples_per_second": 10.745, "eval_steps_per_second": 0.673, "step": 725 }, { "epoch": 2.4024742268041237, "grad_norm": 0.26828116178512573, "learning_rate": 2.2714184620356827e-05, "loss": 0.762, "step": 730 }, { "epoch": 2.435463917525773, "grad_norm": 0.2688361406326294, "learning_rate": 2.033462581555703e-05, "loss": 0.7614, "step": 740 }, { "epoch": 2.4684536082474224, "grad_norm": 0.277687132358551, "learning_rate": 1.80725592972401e-05, "loss": 0.7771, "step": 750 }, { "epoch": 2.4684536082474224, "eval_loss": 0.8589318990707397, "eval_runtime": 401.253, "eval_samples_per_second": 10.744, "eval_steps_per_second": 0.673, "step": 750 }, { "epoch": 2.5014432989690722, "grad_norm": 0.26160988211631775, "learning_rate": 1.5931321212008466e-05, "loss": 0.7781, "step": 760 }, { "epoch": 2.5344329896907216, "grad_norm": 0.27128005027770996, "learning_rate": 1.3914069505972483e-05, "loss": 0.7535, "step": 770 }, { "epoch": 2.5509278350515463, "eval_loss": 0.8590140342712402, "eval_runtime": 401.205, "eval_samples_per_second": 10.745, "eval_steps_per_second": 0.673, "step": 775 }, { "epoch": 2.567422680412371, "grad_norm": 0.2620924115180969, "learning_rate": 1.2023779267340562e-05, "loss": 0.7441, "step": 780 }, { "epoch": 2.600412371134021, "grad_norm": 0.26399528980255127, "learning_rate": 1.0263238338692061e-05, "loss": 0.7709, "step": 790 }, { "epoch": 2.63340206185567, "grad_norm": 0.2564082443714142, "learning_rate": 8.63504320540438e-06, "loss": 0.7785, "step": 800 }, { "epoch": 2.63340206185567, "eval_loss": 0.8589113354682922, "eval_runtime": 401.2229, "eval_samples_per_second": 10.745, "eval_steps_per_second": 0.673, "step": 800 }, { "epoch": 2.6663917525773195, "grad_norm": 0.2638494372367859, "learning_rate": 7.141595166297832e-06, "loss": 0.7764, "step": 810 }, { "epoch": 2.699381443298969, "grad_norm": 0.26147758960723877, "learning_rate": 5.785096792146161e-06, "loss": 0.759, "step": 820 }, { "epoch": 2.7158762886597936, "eval_loss": 0.8589122891426086, "eval_runtime": 401.2905, "eval_samples_per_second": 10.743, "eval_steps_per_second": 0.673, "step": 825 }, { "epoch": 2.7323711340206187, "grad_norm": 0.2687396705150604, "learning_rate": 4.567548677275602e-06, "loss": 0.7602, "step": 830 }, { "epoch": 2.765360824742268, "grad_norm": 0.295311838388443, "learning_rate": 3.490746489043317e-06, "loss": 0.7756, "step": 840 }, { "epoch": 2.7983505154639174, "grad_norm": 0.2631383240222931, "learning_rate": 2.5562783195467676e-06, "loss": 0.7718, "step": 850 }, { "epoch": 2.7983505154639174, "eval_loss": 0.8586205244064331, "eval_runtime": 401.1315, "eval_samples_per_second": 10.747, "eval_steps_per_second": 0.673, "step": 850 }, { "epoch": 2.8313402061855673, "grad_norm": 0.2887561023235321, "learning_rate": 1.7655223434698055e-06, "loss": 0.7855, "step": 860 }, { "epoch": 2.8643298969072166, "grad_norm": 0.27454614639282227, "learning_rate": 1.1196447855195802e-06, "loss": 0.7565, "step": 870 }, { "epoch": 2.8808247422680413, "eval_loss": 0.8585217595100403, "eval_runtime": 401.2068, "eval_samples_per_second": 10.745, "eval_steps_per_second": 0.673, "step": 875 }, { "epoch": 2.897319587628866, "grad_norm": 0.26877835392951965, "learning_rate": 6.195982004521539e-07, "loss": 0.7513, "step": 880 }, { "epoch": 2.9303092783505154, "grad_norm": 0.2676011025905609, "learning_rate": 2.6612006822327454e-07, "loss": 0.7437, "step": 890 }, { "epoch": 2.9632989690721647, "grad_norm": 0.29583173990249634, "learning_rate": 5.973170633631897e-08, "loss": 0.7492, "step": 900 }, { "epoch": 2.9632989690721647, "eval_loss": 0.858492374420166, "eval_runtime": 401.1888, "eval_samples_per_second": 10.746, "eval_steps_per_second": 0.673, "step": 900 }, { "epoch": 2.9929896907216493, "step": 909, "total_flos": 8.709242512830628e+18, "train_loss": 0.8364332821240651, "train_runtime": 42787.2107, "train_samples_per_second": 2.72, "train_steps_per_second": 0.021 } ], "logging_steps": 10, "max_steps": 909, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.709242512830628e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }