diff --git "a/checkpoint-15581/trainer_state.json" "b/checkpoint-15581/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-15581/trainer_state.json" @@ -0,0 +1,10940 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 15581, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.000641843374170618, + "grad_norm": 2.778578758239746, + "learning_rate": 3.6e-07, + "loss": 2.0918, + "step": 10 + }, + { + "epoch": 0.001283686748341236, + "grad_norm": 3.490725517272949, + "learning_rate": 7.6e-07, + "loss": 2.0914, + "step": 20 + }, + { + "epoch": 0.0019255301225118541, + "grad_norm": 2.9524786472320557, + "learning_rate": 1.1600000000000001e-06, + "loss": 2.0855, + "step": 30 + }, + { + "epoch": 0.002567373496682472, + "grad_norm": 2.4857563972473145, + "learning_rate": 1.56e-06, + "loss": 2.0858, + "step": 40 + }, + { + "epoch": 0.00320921687085309, + "grad_norm": 4.684011459350586, + "learning_rate": 1.9600000000000003e-06, + "loss": 2.0894, + "step": 50 + }, + { + "epoch": 0.0038510602450237083, + "grad_norm": 2.8892273902893066, + "learning_rate": 2.3600000000000003e-06, + "loss": 2.0796, + "step": 60 + }, + { + "epoch": 0.004492903619194326, + "grad_norm": 2.8607919216156006, + "learning_rate": 2.7600000000000003e-06, + "loss": 2.0779, + "step": 70 + }, + { + "epoch": 0.005134746993364944, + "grad_norm": 3.9530575275421143, + "learning_rate": 3.1600000000000002e-06, + "loss": 2.0703, + "step": 80 + }, + { + "epoch": 0.005776590367535562, + "grad_norm": 2.9849867820739746, + "learning_rate": 3.5600000000000002e-06, + "loss": 2.0624, + "step": 90 + }, + { + "epoch": 0.00641843374170618, + "grad_norm": 2.684105634689331, + "learning_rate": 3.96e-06, + "loss": 2.0596, + "step": 100 + }, + { + "epoch": 0.007060277115876798, + "grad_norm": 3.2845048904418945, + "learning_rate": 4.360000000000001e-06, + "loss": 2.0403, + "step": 110 + }, + { + "epoch": 0.0077021204900474165, + "grad_norm": 3.709146022796631, + "learning_rate": 4.76e-06, + "loss": 1.9946, + "step": 120 + }, + { + "epoch": 0.008343963864218035, + "grad_norm": 5.354912757873535, + "learning_rate": 5.1600000000000006e-06, + "loss": 1.9471, + "step": 130 + }, + { + "epoch": 0.008985807238388653, + "grad_norm": 5.101510524749756, + "learning_rate": 5.560000000000001e-06, + "loss": 1.8538, + "step": 140 + }, + { + "epoch": 0.00962765061255927, + "grad_norm": Infinity, + "learning_rate": 5.9600000000000005e-06, + "loss": 1.7965, + "step": 150 + }, + { + "epoch": 0.010269493986729888, + "grad_norm": 9.85523509979248, + "learning_rate": 6.360000000000001e-06, + "loss": 1.7491, + "step": 160 + }, + { + "epoch": 0.010911337360900506, + "grad_norm": 13.500116348266602, + "learning_rate": 6.760000000000001e-06, + "loss": 1.6258, + "step": 170 + }, + { + "epoch": 0.011553180735071123, + "grad_norm": 12.905332565307617, + "learning_rate": 7.16e-06, + "loss": 1.6156, + "step": 180 + }, + { + "epoch": 0.012195024109241743, + "grad_norm": 5.522826194763184, + "learning_rate": 7.5600000000000005e-06, + "loss": 1.6384, + "step": 190 + }, + { + "epoch": 0.01283686748341236, + "grad_norm": 9.346458435058594, + "learning_rate": 7.960000000000002e-06, + "loss": 1.5583, + "step": 200 + }, + { + "epoch": 0.013478710857582978, + "grad_norm": 10.575640678405762, + "learning_rate": 8.36e-06, + "loss": 1.4437, + "step": 210 + }, + { + "epoch": 0.014120554231753596, + "grad_norm": 20.125003814697266, + "learning_rate": 8.76e-06, + "loss": 1.4157, + "step": 220 + }, + { + "epoch": 0.014762397605924214, + "grad_norm": 12.119187355041504, + "learning_rate": 9.16e-06, + "loss": 1.3255, + "step": 230 + }, + { + "epoch": 0.015404240980094833, + "grad_norm": 15.212540626525879, + "learning_rate": 9.56e-06, + "loss": 1.2985, + "step": 240 + }, + { + "epoch": 0.01604608435426545, + "grad_norm": 13.789387702941895, + "learning_rate": 9.960000000000001e-06, + "loss": 1.1824, + "step": 250 + }, + { + "epoch": 0.01668792772843607, + "grad_norm": 17.67810821533203, + "learning_rate": 1.036e-05, + "loss": 1.1609, + "step": 260 + }, + { + "epoch": 0.017329771102606688, + "grad_norm": 13.683328628540039, + "learning_rate": 1.0760000000000002e-05, + "loss": 1.0169, + "step": 270 + }, + { + "epoch": 0.017971614476777305, + "grad_norm": 13.746779441833496, + "learning_rate": 1.1160000000000002e-05, + "loss": 0.9953, + "step": 280 + }, + { + "epoch": 0.018613457850947923, + "grad_norm": 17.402944564819336, + "learning_rate": 1.156e-05, + "loss": 0.8542, + "step": 290 + }, + { + "epoch": 0.01925530122511854, + "grad_norm": 18.10095977783203, + "learning_rate": 1.196e-05, + "loss": 0.807, + "step": 300 + }, + { + "epoch": 0.01989714459928916, + "grad_norm": 17.38810920715332, + "learning_rate": 1.236e-05, + "loss": 0.7492, + "step": 310 + }, + { + "epoch": 0.020538987973459776, + "grad_norm": 16.033836364746094, + "learning_rate": 1.2760000000000001e-05, + "loss": 0.7984, + "step": 320 + }, + { + "epoch": 0.021180831347630394, + "grad_norm": 18.93714141845703, + "learning_rate": 1.3160000000000001e-05, + "loss": 0.632, + "step": 330 + }, + { + "epoch": 0.02182267472180101, + "grad_norm": 17.007225036621094, + "learning_rate": 1.3560000000000002e-05, + "loss": 0.6511, + "step": 340 + }, + { + "epoch": 0.02246451809597163, + "grad_norm": 15.338671684265137, + "learning_rate": 1.396e-05, + "loss": 0.6407, + "step": 350 + }, + { + "epoch": 0.023106361470142247, + "grad_norm": 16.871875762939453, + "learning_rate": 1.4360000000000001e-05, + "loss": 0.5841, + "step": 360 + }, + { + "epoch": 0.023748204844312868, + "grad_norm": 12.31517505645752, + "learning_rate": 1.4760000000000001e-05, + "loss": 0.5787, + "step": 370 + }, + { + "epoch": 0.024390048218483486, + "grad_norm": 12.55438232421875, + "learning_rate": 1.516e-05, + "loss": 0.4979, + "step": 380 + }, + { + "epoch": 0.025031891592654103, + "grad_norm": 11.683804512023926, + "learning_rate": 1.556e-05, + "loss": 0.409, + "step": 390 + }, + { + "epoch": 0.02567373496682472, + "grad_norm": 13.466601371765137, + "learning_rate": 1.5960000000000003e-05, + "loss": 0.4344, + "step": 400 + }, + { + "epoch": 0.02631557834099534, + "grad_norm": 13.933389663696289, + "learning_rate": 1.636e-05, + "loss": 0.4355, + "step": 410 + }, + { + "epoch": 0.026957421715165956, + "grad_norm": 14.962430953979492, + "learning_rate": 1.6760000000000002e-05, + "loss": 0.4358, + "step": 420 + }, + { + "epoch": 0.027599265089336574, + "grad_norm": 11.194101333618164, + "learning_rate": 1.7160000000000002e-05, + "loss": 0.3418, + "step": 430 + }, + { + "epoch": 0.028241108463507192, + "grad_norm": 10.003135681152344, + "learning_rate": 1.756e-05, + "loss": 0.407, + "step": 440 + }, + { + "epoch": 0.02888295183767781, + "grad_norm": 13.722594261169434, + "learning_rate": 1.796e-05, + "loss": 0.3784, + "step": 450 + }, + { + "epoch": 0.029524795211848427, + "grad_norm": 11.653929710388184, + "learning_rate": 1.8360000000000004e-05, + "loss": 0.3438, + "step": 460 + }, + { + "epoch": 0.03016663858601905, + "grad_norm": 10.154895782470703, + "learning_rate": 1.876e-05, + "loss": 0.3102, + "step": 470 + }, + { + "epoch": 0.030808481960189666, + "grad_norm": 13.267715454101562, + "learning_rate": 1.916e-05, + "loss": 0.3173, + "step": 480 + }, + { + "epoch": 0.031450325334360284, + "grad_norm": 9.964168548583984, + "learning_rate": 1.9560000000000002e-05, + "loss": 0.3096, + "step": 490 + }, + { + "epoch": 0.0320921687085309, + "grad_norm": 13.796014785766602, + "learning_rate": 1.9960000000000002e-05, + "loss": 0.345, + "step": 500 + }, + { + "epoch": 0.03273401208270152, + "grad_norm": 10.034880638122559, + "learning_rate": 1.998806445195942e-05, + "loss": 0.3417, + "step": 510 + }, + { + "epoch": 0.03337585545687214, + "grad_norm": 9.3333740234375, + "learning_rate": 1.9974802731914332e-05, + "loss": 0.2927, + "step": 520 + }, + { + "epoch": 0.034017698831042754, + "grad_norm": 9.258805274963379, + "learning_rate": 1.996154101186924e-05, + "loss": 0.2624, + "step": 530 + }, + { + "epoch": 0.034659542205213376, + "grad_norm": 6.755455017089844, + "learning_rate": 1.994827929182415e-05, + "loss": 0.2602, + "step": 540 + }, + { + "epoch": 0.03530138557938399, + "grad_norm": 14.036195755004883, + "learning_rate": 1.9935017571779062e-05, + "loss": 0.3068, + "step": 550 + }, + { + "epoch": 0.03594322895355461, + "grad_norm": 7.500053405761719, + "learning_rate": 1.992175585173397e-05, + "loss": 0.2405, + "step": 560 + }, + { + "epoch": 0.036585072327725225, + "grad_norm": 10.683878898620605, + "learning_rate": 1.990849413168888e-05, + "loss": 0.2651, + "step": 570 + }, + { + "epoch": 0.037226915701895846, + "grad_norm": 9.001018524169922, + "learning_rate": 1.9895232411643792e-05, + "loss": 0.3062, + "step": 580 + }, + { + "epoch": 0.03786875907606646, + "grad_norm": 7.354406833648682, + "learning_rate": 1.9881970691598703e-05, + "loss": 0.2741, + "step": 590 + }, + { + "epoch": 0.03851060245023708, + "grad_norm": 7.920845031738281, + "learning_rate": 1.986870897155361e-05, + "loss": 0.2483, + "step": 600 + }, + { + "epoch": 0.039152445824407696, + "grad_norm": 11.448545455932617, + "learning_rate": 1.9855447251508522e-05, + "loss": 0.1963, + "step": 610 + }, + { + "epoch": 0.03979428919857832, + "grad_norm": 8.437763214111328, + "learning_rate": 1.984218553146343e-05, + "loss": 0.2262, + "step": 620 + }, + { + "epoch": 0.04043613257274894, + "grad_norm": 9.328960418701172, + "learning_rate": 1.9828923811418345e-05, + "loss": 0.1793, + "step": 630 + }, + { + "epoch": 0.04107797594691955, + "grad_norm": 7.899652481079102, + "learning_rate": 1.9815662091373252e-05, + "loss": 0.2089, + "step": 640 + }, + { + "epoch": 0.041719819321090174, + "grad_norm": 8.917160987854004, + "learning_rate": 1.9802400371328164e-05, + "loss": 0.2036, + "step": 650 + }, + { + "epoch": 0.04236166269526079, + "grad_norm": 8.51976490020752, + "learning_rate": 1.978913865128307e-05, + "loss": 0.2127, + "step": 660 + }, + { + "epoch": 0.04300350606943141, + "grad_norm": 8.080279350280762, + "learning_rate": 1.9775876931237982e-05, + "loss": 0.2343, + "step": 670 + }, + { + "epoch": 0.04364534944360202, + "grad_norm": 8.942863464355469, + "learning_rate": 1.9762615211192894e-05, + "loss": 0.1784, + "step": 680 + }, + { + "epoch": 0.044287192817772644, + "grad_norm": 8.20798397064209, + "learning_rate": 1.97493534911478e-05, + "loss": 0.149, + "step": 690 + }, + { + "epoch": 0.04492903619194326, + "grad_norm": 7.868164539337158, + "learning_rate": 1.9736091771102716e-05, + "loss": 0.1537, + "step": 700 + }, + { + "epoch": 0.04557087956611388, + "grad_norm": 8.475685119628906, + "learning_rate": 1.9722830051057624e-05, + "loss": 0.1234, + "step": 710 + }, + { + "epoch": 0.046212722940284494, + "grad_norm": 13.600449562072754, + "learning_rate": 1.9709568331012535e-05, + "loss": 0.1871, + "step": 720 + }, + { + "epoch": 0.046854566314455115, + "grad_norm": 7.583765029907227, + "learning_rate": 1.9696306610967443e-05, + "loss": 0.1772, + "step": 730 + }, + { + "epoch": 0.047496409688625736, + "grad_norm": 7.121690273284912, + "learning_rate": 1.9683044890922354e-05, + "loss": 0.196, + "step": 740 + }, + { + "epoch": 0.04813825306279635, + "grad_norm": 7.272078514099121, + "learning_rate": 1.9669783170877265e-05, + "loss": 0.1647, + "step": 750 + }, + { + "epoch": 0.04878009643696697, + "grad_norm": 8.071809768676758, + "learning_rate": 1.9656521450832176e-05, + "loss": 0.1741, + "step": 760 + }, + { + "epoch": 0.049421939811137586, + "grad_norm": 8.809086799621582, + "learning_rate": 1.9643259730787084e-05, + "loss": 0.1501, + "step": 770 + }, + { + "epoch": 0.05006378318530821, + "grad_norm": 7.277254581451416, + "learning_rate": 1.9629998010741995e-05, + "loss": 0.1812, + "step": 780 + }, + { + "epoch": 0.05070562655947882, + "grad_norm": 6.261189937591553, + "learning_rate": 1.9616736290696906e-05, + "loss": 0.1599, + "step": 790 + }, + { + "epoch": 0.05134746993364944, + "grad_norm": 5.358989715576172, + "learning_rate": 1.9603474570651814e-05, + "loss": 0.1563, + "step": 800 + }, + { + "epoch": 0.051989313307820056, + "grad_norm": 5.942981719970703, + "learning_rate": 1.9590212850606725e-05, + "loss": 0.1131, + "step": 810 + }, + { + "epoch": 0.05263115668199068, + "grad_norm": 11.299034118652344, + "learning_rate": 1.9576951130561633e-05, + "loss": 0.1872, + "step": 820 + }, + { + "epoch": 0.05327300005616129, + "grad_norm": 3.3547017574310303, + "learning_rate": 1.9563689410516548e-05, + "loss": 0.1696, + "step": 830 + }, + { + "epoch": 0.05391484343033191, + "grad_norm": 7.165664196014404, + "learning_rate": 1.9550427690471455e-05, + "loss": 0.1668, + "step": 840 + }, + { + "epoch": 0.054556686804502534, + "grad_norm": 9.479928970336914, + "learning_rate": 1.9537165970426367e-05, + "loss": 0.1488, + "step": 850 + }, + { + "epoch": 0.05519853017867315, + "grad_norm": 5.311110019683838, + "learning_rate": 1.9523904250381274e-05, + "loss": 0.1503, + "step": 860 + }, + { + "epoch": 0.05584037355284377, + "grad_norm": 9.265485763549805, + "learning_rate": 1.9510642530336186e-05, + "loss": 0.1382, + "step": 870 + }, + { + "epoch": 0.056482216927014384, + "grad_norm": 11.654985427856445, + "learning_rate": 1.9497380810291097e-05, + "loss": 0.1764, + "step": 880 + }, + { + "epoch": 0.057124060301185005, + "grad_norm": 5.929287433624268, + "learning_rate": 1.9484119090246008e-05, + "loss": 0.1459, + "step": 890 + }, + { + "epoch": 0.05776590367535562, + "grad_norm": 12.011093139648438, + "learning_rate": 1.9470857370200916e-05, + "loss": 0.1202, + "step": 900 + }, + { + "epoch": 0.05840774704952624, + "grad_norm": 5.606518745422363, + "learning_rate": 1.9457595650155827e-05, + "loss": 0.1153, + "step": 910 + }, + { + "epoch": 0.059049590423696854, + "grad_norm": 5.869558811187744, + "learning_rate": 1.9444333930110738e-05, + "loss": 0.1428, + "step": 920 + }, + { + "epoch": 0.059691433797867476, + "grad_norm": 9.52774715423584, + "learning_rate": 1.9431072210065646e-05, + "loss": 0.1347, + "step": 930 + }, + { + "epoch": 0.0603332771720381, + "grad_norm": 8.32546329498291, + "learning_rate": 1.9417810490020557e-05, + "loss": 0.153, + "step": 940 + }, + { + "epoch": 0.06097512054620871, + "grad_norm": 9.355123519897461, + "learning_rate": 1.9404548769975465e-05, + "loss": 0.123, + "step": 950 + }, + { + "epoch": 0.06161696392037933, + "grad_norm": 6.419095039367676, + "learning_rate": 1.939128704993038e-05, + "loss": 0.1075, + "step": 960 + }, + { + "epoch": 0.062258807294549946, + "grad_norm": 8.51015567779541, + "learning_rate": 1.9378025329885287e-05, + "loss": 0.1322, + "step": 970 + }, + { + "epoch": 0.06290065066872057, + "grad_norm": 6.859710216522217, + "learning_rate": 1.93647636098402e-05, + "loss": 0.1363, + "step": 980 + }, + { + "epoch": 0.06354249404289118, + "grad_norm": 6.191632270812988, + "learning_rate": 1.935150188979511e-05, + "loss": 0.1287, + "step": 990 + }, + { + "epoch": 0.0641843374170618, + "grad_norm": 2.526578903198242, + "learning_rate": 1.9338240169750017e-05, + "loss": 0.1466, + "step": 1000 + }, + { + "epoch": 0.06482618079123242, + "grad_norm": 7.311110019683838, + "learning_rate": 1.932497844970493e-05, + "loss": 0.1341, + "step": 1010 + }, + { + "epoch": 0.06546802416540304, + "grad_norm": 1.3056310415267944, + "learning_rate": 1.931171672965984e-05, + "loss": 0.1211, + "step": 1020 + }, + { + "epoch": 0.06610986753957365, + "grad_norm": 9.120423316955566, + "learning_rate": 1.929845500961475e-05, + "loss": 0.1268, + "step": 1030 + }, + { + "epoch": 0.06675171091374428, + "grad_norm": 2.8570845127105713, + "learning_rate": 1.928519328956966e-05, + "loss": 0.0942, + "step": 1040 + }, + { + "epoch": 0.0673935542879149, + "grad_norm": 9.288402557373047, + "learning_rate": 1.927193156952457e-05, + "loss": 0.147, + "step": 1050 + }, + { + "epoch": 0.06803539766208551, + "grad_norm": 6.629462242126465, + "learning_rate": 1.9258669849479478e-05, + "loss": 0.1461, + "step": 1060 + }, + { + "epoch": 0.06867724103625612, + "grad_norm": 3.9713406562805176, + "learning_rate": 1.924540812943439e-05, + "loss": 0.117, + "step": 1070 + }, + { + "epoch": 0.06931908441042675, + "grad_norm": 8.020031929016113, + "learning_rate": 1.92321464093893e-05, + "loss": 0.1106, + "step": 1080 + }, + { + "epoch": 0.06996092778459737, + "grad_norm": 5.471112251281738, + "learning_rate": 1.921888468934421e-05, + "loss": 0.114, + "step": 1090 + }, + { + "epoch": 0.07060277115876798, + "grad_norm": 2.482114315032959, + "learning_rate": 1.920562296929912e-05, + "loss": 0.0784, + "step": 1100 + }, + { + "epoch": 0.0712446145329386, + "grad_norm": 6.366470813751221, + "learning_rate": 1.919236124925403e-05, + "loss": 0.1158, + "step": 1110 + }, + { + "epoch": 0.07188645790710922, + "grad_norm": 8.483400344848633, + "learning_rate": 1.917909952920894e-05, + "loss": 0.1098, + "step": 1120 + }, + { + "epoch": 0.07252830128127984, + "grad_norm": 7.386050701141357, + "learning_rate": 1.916583780916385e-05, + "loss": 0.1484, + "step": 1130 + }, + { + "epoch": 0.07317014465545045, + "grad_norm": 4.264742374420166, + "learning_rate": 1.915257608911876e-05, + "loss": 0.1496, + "step": 1140 + }, + { + "epoch": 0.07381198802962108, + "grad_norm": 6.322717666625977, + "learning_rate": 1.9139314369073668e-05, + "loss": 0.1267, + "step": 1150 + }, + { + "epoch": 0.07445383140379169, + "grad_norm": 4.288897514343262, + "learning_rate": 1.9126052649028582e-05, + "loss": 0.1293, + "step": 1160 + }, + { + "epoch": 0.0750956747779623, + "grad_norm": 3.701270818710327, + "learning_rate": 1.911279092898349e-05, + "loss": 0.1303, + "step": 1170 + }, + { + "epoch": 0.07573751815213292, + "grad_norm": 5.492668628692627, + "learning_rate": 1.90995292089384e-05, + "loss": 0.0987, + "step": 1180 + }, + { + "epoch": 0.07637936152630355, + "grad_norm": 4.863729953765869, + "learning_rate": 1.908626748889331e-05, + "loss": 0.1186, + "step": 1190 + }, + { + "epoch": 0.07702120490047416, + "grad_norm": 5.729610443115234, + "learning_rate": 1.907300576884822e-05, + "loss": 0.1224, + "step": 1200 + }, + { + "epoch": 0.07766304827464478, + "grad_norm": 3.333712100982666, + "learning_rate": 1.905974404880313e-05, + "loss": 0.1084, + "step": 1210 + }, + { + "epoch": 0.07830489164881539, + "grad_norm": 9.325702667236328, + "learning_rate": 1.9046482328758043e-05, + "loss": 0.0838, + "step": 1220 + }, + { + "epoch": 0.07894673502298602, + "grad_norm": 4.56976842880249, + "learning_rate": 1.9033220608712954e-05, + "loss": 0.0924, + "step": 1230 + }, + { + "epoch": 0.07958857839715663, + "grad_norm": 5.612900733947754, + "learning_rate": 1.9019958888667862e-05, + "loss": 0.1233, + "step": 1240 + }, + { + "epoch": 0.08023042177132725, + "grad_norm": 5.123150825500488, + "learning_rate": 1.9006697168622773e-05, + "loss": 0.0937, + "step": 1250 + }, + { + "epoch": 0.08087226514549788, + "grad_norm": 5.567653179168701, + "learning_rate": 1.899343544857768e-05, + "loss": 0.0755, + "step": 1260 + }, + { + "epoch": 0.08151410851966849, + "grad_norm": 1.4452855587005615, + "learning_rate": 1.8980173728532592e-05, + "loss": 0.0767, + "step": 1270 + }, + { + "epoch": 0.0821559518938391, + "grad_norm": 3.7622110843658447, + "learning_rate": 1.8966912008487503e-05, + "loss": 0.087, + "step": 1280 + }, + { + "epoch": 0.08279779526800972, + "grad_norm": 1.222861886024475, + "learning_rate": 1.8953650288442414e-05, + "loss": 0.0841, + "step": 1290 + }, + { + "epoch": 0.08343963864218035, + "grad_norm": 5.492730140686035, + "learning_rate": 1.8940388568397322e-05, + "loss": 0.0691, + "step": 1300 + }, + { + "epoch": 0.08408148201635096, + "grad_norm": 4.395420074462891, + "learning_rate": 1.8927126848352233e-05, + "loss": 0.0668, + "step": 1310 + }, + { + "epoch": 0.08472332539052158, + "grad_norm": 6.103671550750732, + "learning_rate": 1.8913865128307144e-05, + "loss": 0.0723, + "step": 1320 + }, + { + "epoch": 0.08536516876469219, + "grad_norm": 5.228280067443848, + "learning_rate": 1.8900603408262052e-05, + "loss": 0.1032, + "step": 1330 + }, + { + "epoch": 0.08600701213886282, + "grad_norm": 1.1769144535064697, + "learning_rate": 1.8887341688216963e-05, + "loss": 0.0815, + "step": 1340 + }, + { + "epoch": 0.08664885551303343, + "grad_norm": 6.19387149810791, + "learning_rate": 1.8874079968171874e-05, + "loss": 0.0913, + "step": 1350 + }, + { + "epoch": 0.08729069888720405, + "grad_norm": 2.781297206878662, + "learning_rate": 1.8860818248126786e-05, + "loss": 0.0752, + "step": 1360 + }, + { + "epoch": 0.08793254226137467, + "grad_norm": 3.160879611968994, + "learning_rate": 1.8847556528081693e-05, + "loss": 0.0811, + "step": 1370 + }, + { + "epoch": 0.08857438563554529, + "grad_norm": 4.549344539642334, + "learning_rate": 1.8834294808036605e-05, + "loss": 0.1094, + "step": 1380 + }, + { + "epoch": 0.0892162290097159, + "grad_norm": 6.160617828369141, + "learning_rate": 1.8821033087991512e-05, + "loss": 0.0858, + "step": 1390 + }, + { + "epoch": 0.08985807238388652, + "grad_norm": 7.712894916534424, + "learning_rate": 1.8807771367946424e-05, + "loss": 0.0855, + "step": 1400 + }, + { + "epoch": 0.09049991575805715, + "grad_norm": 6.89589262008667, + "learning_rate": 1.8794509647901335e-05, + "loss": 0.0782, + "step": 1410 + }, + { + "epoch": 0.09114175913222776, + "grad_norm": 4.766812324523926, + "learning_rate": 1.8781247927856246e-05, + "loss": 0.0917, + "step": 1420 + }, + { + "epoch": 0.09178360250639837, + "grad_norm": 7.581903457641602, + "learning_rate": 1.8767986207811154e-05, + "loss": 0.1413, + "step": 1430 + }, + { + "epoch": 0.09242544588056899, + "grad_norm": 7.758519649505615, + "learning_rate": 1.8754724487766065e-05, + "loss": 0.0994, + "step": 1440 + }, + { + "epoch": 0.09306728925473962, + "grad_norm": 1.8544800281524658, + "learning_rate": 1.8741462767720976e-05, + "loss": 0.0913, + "step": 1450 + }, + { + "epoch": 0.09370913262891023, + "grad_norm": 6.1761674880981445, + "learning_rate": 1.8728201047675884e-05, + "loss": 0.1086, + "step": 1460 + }, + { + "epoch": 0.09435097600308084, + "grad_norm": 6.579728603363037, + "learning_rate": 1.8714939327630795e-05, + "loss": 0.1467, + "step": 1470 + }, + { + "epoch": 0.09499281937725147, + "grad_norm": 6.048295974731445, + "learning_rate": 1.8701677607585706e-05, + "loss": 0.0818, + "step": 1480 + }, + { + "epoch": 0.09563466275142209, + "grad_norm": 5.720274448394775, + "learning_rate": 1.8688415887540617e-05, + "loss": 0.0729, + "step": 1490 + }, + { + "epoch": 0.0962765061255927, + "grad_norm": 8.690102577209473, + "learning_rate": 1.8675154167495525e-05, + "loss": 0.0647, + "step": 1500 + }, + { + "epoch": 0.09691834949976331, + "grad_norm": 3.565424680709839, + "learning_rate": 1.8661892447450436e-05, + "loss": 0.0977, + "step": 1510 + }, + { + "epoch": 0.09756019287393394, + "grad_norm": 8.023870468139648, + "learning_rate": 1.8648630727405347e-05, + "loss": 0.1096, + "step": 1520 + }, + { + "epoch": 0.09820203624810456, + "grad_norm": 5.473697185516357, + "learning_rate": 1.8635369007360255e-05, + "loss": 0.0933, + "step": 1530 + }, + { + "epoch": 0.09884387962227517, + "grad_norm": 1.821287751197815, + "learning_rate": 1.8622107287315166e-05, + "loss": 0.0839, + "step": 1540 + }, + { + "epoch": 0.09948572299644579, + "grad_norm": 6.229073524475098, + "learning_rate": 1.8608845567270078e-05, + "loss": 0.1008, + "step": 1550 + }, + { + "epoch": 0.10012756637061641, + "grad_norm": 5.2744550704956055, + "learning_rate": 1.859558384722499e-05, + "loss": 0.0841, + "step": 1560 + }, + { + "epoch": 0.10076940974478703, + "grad_norm": 0.6284772753715515, + "learning_rate": 1.8582322127179897e-05, + "loss": 0.0721, + "step": 1570 + }, + { + "epoch": 0.10141125311895764, + "grad_norm": 4.158771991729736, + "learning_rate": 1.8569060407134808e-05, + "loss": 0.0918, + "step": 1580 + }, + { + "epoch": 0.10205309649312827, + "grad_norm": 1.499607801437378, + "learning_rate": 1.8555798687089715e-05, + "loss": 0.0836, + "step": 1590 + }, + { + "epoch": 0.10269493986729888, + "grad_norm": 5.547268867492676, + "learning_rate": 1.8542536967044627e-05, + "loss": 0.0638, + "step": 1600 + }, + { + "epoch": 0.1033367832414695, + "grad_norm": 1.7448383569717407, + "learning_rate": 1.8529275246999538e-05, + "loss": 0.0832, + "step": 1610 + }, + { + "epoch": 0.10397862661564011, + "grad_norm": 7.011538028717041, + "learning_rate": 1.851601352695445e-05, + "loss": 0.0867, + "step": 1620 + }, + { + "epoch": 0.10462046998981074, + "grad_norm": 5.350290298461914, + "learning_rate": 1.8502751806909357e-05, + "loss": 0.0698, + "step": 1630 + }, + { + "epoch": 0.10526231336398136, + "grad_norm": 2.6952505111694336, + "learning_rate": 1.8489490086864268e-05, + "loss": 0.0789, + "step": 1640 + }, + { + "epoch": 0.10590415673815197, + "grad_norm": 2.261810779571533, + "learning_rate": 1.847622836681918e-05, + "loss": 0.0778, + "step": 1650 + }, + { + "epoch": 0.10654600011232258, + "grad_norm": 4.009234428405762, + "learning_rate": 1.8462966646774087e-05, + "loss": 0.0621, + "step": 1660 + }, + { + "epoch": 0.10718784348649321, + "grad_norm": 7.318385601043701, + "learning_rate": 1.8449704926728998e-05, + "loss": 0.0589, + "step": 1670 + }, + { + "epoch": 0.10782968686066383, + "grad_norm": 1.8443090915679932, + "learning_rate": 1.843644320668391e-05, + "loss": 0.0832, + "step": 1680 + }, + { + "epoch": 0.10847153023483444, + "grad_norm": 6.533380508422852, + "learning_rate": 1.842318148663882e-05, + "loss": 0.0944, + "step": 1690 + }, + { + "epoch": 0.10911337360900507, + "grad_norm": 7.015242099761963, + "learning_rate": 1.8409919766593728e-05, + "loss": 0.0743, + "step": 1700 + }, + { + "epoch": 0.10975521698317568, + "grad_norm": 5.167893409729004, + "learning_rate": 1.839665804654864e-05, + "loss": 0.0946, + "step": 1710 + }, + { + "epoch": 0.1103970603573463, + "grad_norm": 6.0605058670043945, + "learning_rate": 1.8383396326503547e-05, + "loss": 0.0796, + "step": 1720 + }, + { + "epoch": 0.11103890373151691, + "grad_norm": 6.715466022491455, + "learning_rate": 1.837013460645846e-05, + "loss": 0.0782, + "step": 1730 + }, + { + "epoch": 0.11168074710568754, + "grad_norm": 2.545156955718994, + "learning_rate": 1.835687288641337e-05, + "loss": 0.0713, + "step": 1740 + }, + { + "epoch": 0.11232259047985815, + "grad_norm": 2.0205812454223633, + "learning_rate": 1.834361116636828e-05, + "loss": 0.0783, + "step": 1750 + }, + { + "epoch": 0.11296443385402877, + "grad_norm": 7.414153575897217, + "learning_rate": 1.833034944632319e-05, + "loss": 0.0533, + "step": 1760 + }, + { + "epoch": 0.1136062772281994, + "grad_norm": 1.9587926864624023, + "learning_rate": 1.83170877262781e-05, + "loss": 0.064, + "step": 1770 + }, + { + "epoch": 0.11424812060237001, + "grad_norm": 4.822171211242676, + "learning_rate": 1.830382600623301e-05, + "loss": 0.0537, + "step": 1780 + }, + { + "epoch": 0.11488996397654062, + "grad_norm": 7.510979652404785, + "learning_rate": 1.829056428618792e-05, + "loss": 0.0759, + "step": 1790 + }, + { + "epoch": 0.11553180735071124, + "grad_norm": 2.2131752967834473, + "learning_rate": 1.827730256614283e-05, + "loss": 0.0611, + "step": 1800 + }, + { + "epoch": 0.11617365072488187, + "grad_norm": 6.622641563415527, + "learning_rate": 1.826404084609774e-05, + "loss": 0.0564, + "step": 1810 + }, + { + "epoch": 0.11681549409905248, + "grad_norm": 2.9961423873901367, + "learning_rate": 1.8250779126052652e-05, + "loss": 0.0731, + "step": 1820 + }, + { + "epoch": 0.1174573374732231, + "grad_norm": 0.5653957724571228, + "learning_rate": 1.823751740600756e-05, + "loss": 0.0794, + "step": 1830 + }, + { + "epoch": 0.11809918084739371, + "grad_norm": 4.936248302459717, + "learning_rate": 1.822425568596247e-05, + "loss": 0.0809, + "step": 1840 + }, + { + "epoch": 0.11874102422156434, + "grad_norm": 2.4790220260620117, + "learning_rate": 1.8210993965917382e-05, + "loss": 0.0462, + "step": 1850 + }, + { + "epoch": 0.11938286759573495, + "grad_norm": 5.853545665740967, + "learning_rate": 1.819773224587229e-05, + "loss": 0.0746, + "step": 1860 + }, + { + "epoch": 0.12002471096990557, + "grad_norm": 2.9562971591949463, + "learning_rate": 1.81844705258272e-05, + "loss": 0.0618, + "step": 1870 + }, + { + "epoch": 0.1206665543440762, + "grad_norm": 5.309076309204102, + "learning_rate": 1.8171208805782112e-05, + "loss": 0.0814, + "step": 1880 + }, + { + "epoch": 0.12130839771824681, + "grad_norm": 3.951395273208618, + "learning_rate": 1.8157947085737024e-05, + "loss": 0.0993, + "step": 1890 + }, + { + "epoch": 0.12195024109241742, + "grad_norm": 3.8987228870391846, + "learning_rate": 1.814468536569193e-05, + "loss": 0.0684, + "step": 1900 + }, + { + "epoch": 0.12259208446658804, + "grad_norm": 2.9505648612976074, + "learning_rate": 1.8131423645646842e-05, + "loss": 0.057, + "step": 1910 + }, + { + "epoch": 0.12323392784075866, + "grad_norm": 4.24475622177124, + "learning_rate": 1.811816192560175e-05, + "loss": 0.0593, + "step": 1920 + }, + { + "epoch": 0.12387577121492928, + "grad_norm": 5.541717052459717, + "learning_rate": 1.810490020555666e-05, + "loss": 0.0644, + "step": 1930 + }, + { + "epoch": 0.12451761458909989, + "grad_norm": 4.3561248779296875, + "learning_rate": 1.8091638485511573e-05, + "loss": 0.0463, + "step": 1940 + }, + { + "epoch": 0.12515945796327052, + "grad_norm": 0.40995126962661743, + "learning_rate": 1.8078376765466484e-05, + "loss": 0.0758, + "step": 1950 + }, + { + "epoch": 0.12580130133744113, + "grad_norm": 1.771988868713379, + "learning_rate": 1.806511504542139e-05, + "loss": 0.0722, + "step": 1960 + }, + { + "epoch": 0.12644314471161175, + "grad_norm": 1.8136435747146606, + "learning_rate": 1.8051853325376303e-05, + "loss": 0.0673, + "step": 1970 + }, + { + "epoch": 0.12708498808578236, + "grad_norm": 4.0566277503967285, + "learning_rate": 1.8038591605331214e-05, + "loss": 0.0566, + "step": 1980 + }, + { + "epoch": 0.12772683145995298, + "grad_norm": 4.5531439781188965, + "learning_rate": 1.8025329885286122e-05, + "loss": 0.0653, + "step": 1990 + }, + { + "epoch": 0.1283686748341236, + "grad_norm": 3.038701057434082, + "learning_rate": 1.8012068165241033e-05, + "loss": 0.048, + "step": 2000 + }, + { + "epoch": 0.12901051820829423, + "grad_norm": 10.788609504699707, + "learning_rate": 1.7998806445195944e-05, + "loss": 0.0777, + "step": 2010 + }, + { + "epoch": 0.12965236158246485, + "grad_norm": 7.363852024078369, + "learning_rate": 1.7985544725150855e-05, + "loss": 0.0692, + "step": 2020 + }, + { + "epoch": 0.13029420495663546, + "grad_norm": 4.210390090942383, + "learning_rate": 1.7972283005105763e-05, + "loss": 0.089, + "step": 2030 + }, + { + "epoch": 0.13093604833080608, + "grad_norm": 3.144101858139038, + "learning_rate": 1.7959021285060674e-05, + "loss": 0.0683, + "step": 2040 + }, + { + "epoch": 0.1315778917049767, + "grad_norm": 2.3627800941467285, + "learning_rate": 1.7945759565015582e-05, + "loss": 0.0427, + "step": 2050 + }, + { + "epoch": 0.1322197350791473, + "grad_norm": 3.693427562713623, + "learning_rate": 1.7932497844970493e-05, + "loss": 0.0698, + "step": 2060 + }, + { + "epoch": 0.13286157845331792, + "grad_norm": 6.718416690826416, + "learning_rate": 1.7919236124925404e-05, + "loss": 0.0688, + "step": 2070 + }, + { + "epoch": 0.13350342182748856, + "grad_norm": 3.3696110248565674, + "learning_rate": 1.7905974404880315e-05, + "loss": 0.0684, + "step": 2080 + }, + { + "epoch": 0.13414526520165918, + "grad_norm": 4.205007553100586, + "learning_rate": 1.7892712684835227e-05, + "loss": 0.0947, + "step": 2090 + }, + { + "epoch": 0.1347871085758298, + "grad_norm": 6.637697696685791, + "learning_rate": 1.7879450964790134e-05, + "loss": 0.0672, + "step": 2100 + }, + { + "epoch": 0.1354289519500004, + "grad_norm": 5.024839401245117, + "learning_rate": 1.7866189244745046e-05, + "loss": 0.0548, + "step": 2110 + }, + { + "epoch": 0.13607079532417102, + "grad_norm": 2.4971957206726074, + "learning_rate": 1.7852927524699953e-05, + "loss": 0.0721, + "step": 2120 + }, + { + "epoch": 0.13671263869834163, + "grad_norm": 4.841691017150879, + "learning_rate": 1.7839665804654865e-05, + "loss": 0.0579, + "step": 2130 + }, + { + "epoch": 0.13735448207251225, + "grad_norm": 1.5802949666976929, + "learning_rate": 1.7826404084609776e-05, + "loss": 0.0478, + "step": 2140 + }, + { + "epoch": 0.13799632544668286, + "grad_norm": 3.178330421447754, + "learning_rate": 1.7813142364564687e-05, + "loss": 0.0382, + "step": 2150 + }, + { + "epoch": 0.1386381688208535, + "grad_norm": 1.2376867532730103, + "learning_rate": 1.7799880644519595e-05, + "loss": 0.0493, + "step": 2160 + }, + { + "epoch": 0.13928001219502412, + "grad_norm": 4.373589515686035, + "learning_rate": 1.7786618924474506e-05, + "loss": 0.0722, + "step": 2170 + }, + { + "epoch": 0.13992185556919473, + "grad_norm": 8.756216049194336, + "learning_rate": 1.7773357204429417e-05, + "loss": 0.0661, + "step": 2180 + }, + { + "epoch": 0.14056369894336534, + "grad_norm": 3.211627960205078, + "learning_rate": 1.7760095484384325e-05, + "loss": 0.0808, + "step": 2190 + }, + { + "epoch": 0.14120554231753596, + "grad_norm": 2.4077908992767334, + "learning_rate": 1.7746833764339236e-05, + "loss": 0.0597, + "step": 2200 + }, + { + "epoch": 0.14184738569170657, + "grad_norm": 3.222778081893921, + "learning_rate": 1.7733572044294147e-05, + "loss": 0.0823, + "step": 2210 + }, + { + "epoch": 0.1424892290658772, + "grad_norm": 6.3043742179870605, + "learning_rate": 1.772031032424906e-05, + "loss": 0.053, + "step": 2220 + }, + { + "epoch": 0.14313107244004783, + "grad_norm": 3.8473875522613525, + "learning_rate": 1.7707048604203966e-05, + "loss": 0.0458, + "step": 2230 + }, + { + "epoch": 0.14377291581421844, + "grad_norm": 1.6303006410598755, + "learning_rate": 1.7693786884158877e-05, + "loss": 0.0436, + "step": 2240 + }, + { + "epoch": 0.14441475918838906, + "grad_norm": 6.238514423370361, + "learning_rate": 1.7680525164113785e-05, + "loss": 0.0694, + "step": 2250 + }, + { + "epoch": 0.14505660256255967, + "grad_norm": 1.9263882637023926, + "learning_rate": 1.7667263444068696e-05, + "loss": 0.052, + "step": 2260 + }, + { + "epoch": 0.1456984459367303, + "grad_norm": 5.976217746734619, + "learning_rate": 1.7654001724023607e-05, + "loss": 0.0585, + "step": 2270 + }, + { + "epoch": 0.1463402893109009, + "grad_norm": 2.3731586933135986, + "learning_rate": 1.764074000397852e-05, + "loss": 0.0548, + "step": 2280 + }, + { + "epoch": 0.14698213268507151, + "grad_norm": 2.778264045715332, + "learning_rate": 1.7627478283933426e-05, + "loss": 0.0442, + "step": 2290 + }, + { + "epoch": 0.14762397605924216, + "grad_norm": 2.549161672592163, + "learning_rate": 1.7614216563888338e-05, + "loss": 0.0512, + "step": 2300 + }, + { + "epoch": 0.14826581943341277, + "grad_norm": 2.6218628883361816, + "learning_rate": 1.760095484384325e-05, + "loss": 0.0504, + "step": 2310 + }, + { + "epoch": 0.14890766280758339, + "grad_norm": 2.735814094543457, + "learning_rate": 1.7587693123798157e-05, + "loss": 0.0486, + "step": 2320 + }, + { + "epoch": 0.149549506181754, + "grad_norm": 5.613633155822754, + "learning_rate": 1.7574431403753068e-05, + "loss": 0.0499, + "step": 2330 + }, + { + "epoch": 0.1501913495559246, + "grad_norm": 4.932735443115234, + "learning_rate": 1.756116968370798e-05, + "loss": 0.094, + "step": 2340 + }, + { + "epoch": 0.15083319293009523, + "grad_norm": 1.0233629941940308, + "learning_rate": 1.754790796366289e-05, + "loss": 0.052, + "step": 2350 + }, + { + "epoch": 0.15147503630426584, + "grad_norm": 1.32416832447052, + "learning_rate": 1.7534646243617798e-05, + "loss": 0.0529, + "step": 2360 + }, + { + "epoch": 0.15211687967843646, + "grad_norm": 2.5237696170806885, + "learning_rate": 1.752138452357271e-05, + "loss": 0.054, + "step": 2370 + }, + { + "epoch": 0.1527587230526071, + "grad_norm": 9.520683288574219, + "learning_rate": 1.750812280352762e-05, + "loss": 0.0693, + "step": 2380 + }, + { + "epoch": 0.1534005664267777, + "grad_norm": 5.118333339691162, + "learning_rate": 1.7494861083482528e-05, + "loss": 0.0591, + "step": 2390 + }, + { + "epoch": 0.15404240980094833, + "grad_norm": 1.8272361755371094, + "learning_rate": 1.748159936343744e-05, + "loss": 0.0525, + "step": 2400 + }, + { + "epoch": 0.15468425317511894, + "grad_norm": 5.692248344421387, + "learning_rate": 1.746833764339235e-05, + "loss": 0.0603, + "step": 2410 + }, + { + "epoch": 0.15532609654928956, + "grad_norm": 1.802622675895691, + "learning_rate": 1.745507592334726e-05, + "loss": 0.0453, + "step": 2420 + }, + { + "epoch": 0.15596793992346017, + "grad_norm": 8.569664001464844, + "learning_rate": 1.744181420330217e-05, + "loss": 0.0684, + "step": 2430 + }, + { + "epoch": 0.15660978329763078, + "grad_norm": 1.4416598081588745, + "learning_rate": 1.742855248325708e-05, + "loss": 0.0574, + "step": 2440 + }, + { + "epoch": 0.15725162667180143, + "grad_norm": 1.8214036226272583, + "learning_rate": 1.7415290763211988e-05, + "loss": 0.0694, + "step": 2450 + }, + { + "epoch": 0.15789347004597204, + "grad_norm": 2.4023277759552, + "learning_rate": 1.74020290431669e-05, + "loss": 0.0577, + "step": 2460 + }, + { + "epoch": 0.15853531342014265, + "grad_norm": 9.605439186096191, + "learning_rate": 1.738876732312181e-05, + "loss": 0.0671, + "step": 2470 + }, + { + "epoch": 0.15917715679431327, + "grad_norm": 0.8370853066444397, + "learning_rate": 1.7375505603076722e-05, + "loss": 0.0528, + "step": 2480 + }, + { + "epoch": 0.15981900016848388, + "grad_norm": 3.8083314895629883, + "learning_rate": 1.736224388303163e-05, + "loss": 0.0632, + "step": 2490 + }, + { + "epoch": 0.1604608435426545, + "grad_norm": 4.146091938018799, + "learning_rate": 1.734898216298654e-05, + "loss": 0.048, + "step": 2500 + }, + { + "epoch": 0.1611026869168251, + "grad_norm": 3.8874735832214355, + "learning_rate": 1.7335720442941452e-05, + "loss": 0.0645, + "step": 2510 + }, + { + "epoch": 0.16174453029099575, + "grad_norm": 2.019355535507202, + "learning_rate": 1.732245872289636e-05, + "loss": 0.0493, + "step": 2520 + }, + { + "epoch": 0.16238637366516637, + "grad_norm": 3.533977746963501, + "learning_rate": 1.730919700285127e-05, + "loss": 0.0529, + "step": 2530 + }, + { + "epoch": 0.16302821703933698, + "grad_norm": 3.5773158073425293, + "learning_rate": 1.7295935282806182e-05, + "loss": 0.0462, + "step": 2540 + }, + { + "epoch": 0.1636700604135076, + "grad_norm": 2.2680728435516357, + "learning_rate": 1.7282673562761093e-05, + "loss": 0.0772, + "step": 2550 + }, + { + "epoch": 0.1643119037876782, + "grad_norm": 4.510113716125488, + "learning_rate": 1.7269411842716e-05, + "loss": 0.049, + "step": 2560 + }, + { + "epoch": 0.16495374716184882, + "grad_norm": 6.012620449066162, + "learning_rate": 1.7256150122670912e-05, + "loss": 0.0701, + "step": 2570 + }, + { + "epoch": 0.16559559053601944, + "grad_norm": 4.502586841583252, + "learning_rate": 1.724288840262582e-05, + "loss": 0.058, + "step": 2580 + }, + { + "epoch": 0.16623743391019005, + "grad_norm": 0.7591032981872559, + "learning_rate": 1.722962668258073e-05, + "loss": 0.0219, + "step": 2590 + }, + { + "epoch": 0.1668792772843607, + "grad_norm": 3.2919363975524902, + "learning_rate": 1.7216364962535642e-05, + "loss": 0.0649, + "step": 2600 + }, + { + "epoch": 0.1675211206585313, + "grad_norm": 2.932372570037842, + "learning_rate": 1.7203103242490553e-05, + "loss": 0.0419, + "step": 2610 + }, + { + "epoch": 0.16816296403270192, + "grad_norm": 2.4418764114379883, + "learning_rate": 1.7189841522445465e-05, + "loss": 0.0495, + "step": 2620 + }, + { + "epoch": 0.16880480740687254, + "grad_norm": 3.327172040939331, + "learning_rate": 1.7176579802400372e-05, + "loss": 0.0531, + "step": 2630 + }, + { + "epoch": 0.16944665078104315, + "grad_norm": 3.8956868648529053, + "learning_rate": 1.7163318082355284e-05, + "loss": 0.05, + "step": 2640 + }, + { + "epoch": 0.17008849415521377, + "grad_norm": 2.2222423553466797, + "learning_rate": 1.715005636231019e-05, + "loss": 0.0582, + "step": 2650 + }, + { + "epoch": 0.17073033752938438, + "grad_norm": 2.0240089893341064, + "learning_rate": 1.7136794642265102e-05, + "loss": 0.0755, + "step": 2660 + }, + { + "epoch": 0.17137218090355502, + "grad_norm": 2.35172176361084, + "learning_rate": 1.7123532922220014e-05, + "loss": 0.0713, + "step": 2670 + }, + { + "epoch": 0.17201402427772564, + "grad_norm": 2.8345654010772705, + "learning_rate": 1.7110271202174925e-05, + "loss": 0.0456, + "step": 2680 + }, + { + "epoch": 0.17265586765189625, + "grad_norm": 0.52686607837677, + "learning_rate": 1.7097009482129833e-05, + "loss": 0.0634, + "step": 2690 + }, + { + "epoch": 0.17329771102606686, + "grad_norm": 4.439308166503906, + "learning_rate": 1.7083747762084744e-05, + "loss": 0.0523, + "step": 2700 + }, + { + "epoch": 0.17393955440023748, + "grad_norm": 1.9147131443023682, + "learning_rate": 1.7070486042039655e-05, + "loss": 0.0418, + "step": 2710 + }, + { + "epoch": 0.1745813977744081, + "grad_norm": 0.44640451669692993, + "learning_rate": 1.7057224321994563e-05, + "loss": 0.0447, + "step": 2720 + }, + { + "epoch": 0.1752232411485787, + "grad_norm": 3.7846157550811768, + "learning_rate": 1.7043962601949474e-05, + "loss": 0.0424, + "step": 2730 + }, + { + "epoch": 0.17586508452274935, + "grad_norm": 4.769283294677734, + "learning_rate": 1.7030700881904385e-05, + "loss": 0.0394, + "step": 2740 + }, + { + "epoch": 0.17650692789691996, + "grad_norm": 0.8889997005462646, + "learning_rate": 1.7017439161859296e-05, + "loss": 0.0422, + "step": 2750 + }, + { + "epoch": 0.17714877127109058, + "grad_norm": 2.665426731109619, + "learning_rate": 1.7004177441814204e-05, + "loss": 0.0495, + "step": 2760 + }, + { + "epoch": 0.1777906146452612, + "grad_norm": 2.6268789768218994, + "learning_rate": 1.6990915721769115e-05, + "loss": 0.0503, + "step": 2770 + }, + { + "epoch": 0.1784324580194318, + "grad_norm": 3.4581639766693115, + "learning_rate": 1.6977654001724023e-05, + "loss": 0.0483, + "step": 2780 + }, + { + "epoch": 0.17907430139360242, + "grad_norm": 1.1500879526138306, + "learning_rate": 1.6964392281678934e-05, + "loss": 0.0438, + "step": 2790 + }, + { + "epoch": 0.17971614476777303, + "grad_norm": 6.2053022384643555, + "learning_rate": 1.6951130561633845e-05, + "loss": 0.0591, + "step": 2800 + }, + { + "epoch": 0.18035798814194368, + "grad_norm": 3.427523136138916, + "learning_rate": 1.6937868841588757e-05, + "loss": 0.0602, + "step": 2810 + }, + { + "epoch": 0.1809998315161143, + "grad_norm": 6.429181098937988, + "learning_rate": 1.6924607121543664e-05, + "loss": 0.0462, + "step": 2820 + }, + { + "epoch": 0.1816416748902849, + "grad_norm": 1.9181201457977295, + "learning_rate": 1.6911345401498575e-05, + "loss": 0.0402, + "step": 2830 + }, + { + "epoch": 0.18228351826445552, + "grad_norm": 1.2943888902664185, + "learning_rate": 1.6898083681453487e-05, + "loss": 0.0529, + "step": 2840 + }, + { + "epoch": 0.18292536163862613, + "grad_norm": 3.2357335090637207, + "learning_rate": 1.6884821961408394e-05, + "loss": 0.0726, + "step": 2850 + }, + { + "epoch": 0.18356720501279675, + "grad_norm": 0.8399866223335266, + "learning_rate": 1.6871560241363306e-05, + "loss": 0.034, + "step": 2860 + }, + { + "epoch": 0.18420904838696736, + "grad_norm": 1.1773546934127808, + "learning_rate": 1.6858298521318217e-05, + "loss": 0.0533, + "step": 2870 + }, + { + "epoch": 0.18485089176113798, + "grad_norm": 1.484147071838379, + "learning_rate": 1.6845036801273128e-05, + "loss": 0.0405, + "step": 2880 + }, + { + "epoch": 0.18549273513530862, + "grad_norm": 1.646011233329773, + "learning_rate": 1.6831775081228036e-05, + "loss": 0.0766, + "step": 2890 + }, + { + "epoch": 0.18613457850947923, + "grad_norm": 4.303348064422607, + "learning_rate": 1.6818513361182947e-05, + "loss": 0.0484, + "step": 2900 + }, + { + "epoch": 0.18677642188364985, + "grad_norm": 1.86674165725708, + "learning_rate": 1.6805251641137855e-05, + "loss": 0.0373, + "step": 2910 + }, + { + "epoch": 0.18741826525782046, + "grad_norm": 4.927422523498535, + "learning_rate": 1.6791989921092766e-05, + "loss": 0.059, + "step": 2920 + }, + { + "epoch": 0.18806010863199107, + "grad_norm": 1.2013154029846191, + "learning_rate": 1.6778728201047677e-05, + "loss": 0.0484, + "step": 2930 + }, + { + "epoch": 0.1887019520061617, + "grad_norm": 1.8602125644683838, + "learning_rate": 1.6765466481002588e-05, + "loss": 0.0651, + "step": 2940 + }, + { + "epoch": 0.1893437953803323, + "grad_norm": 2.0056393146514893, + "learning_rate": 1.67522047609575e-05, + "loss": 0.0347, + "step": 2950 + }, + { + "epoch": 0.18998563875450294, + "grad_norm": 4.299374580383301, + "learning_rate": 1.6738943040912407e-05, + "loss": 0.0415, + "step": 2960 + }, + { + "epoch": 0.19062748212867356, + "grad_norm": 1.701195240020752, + "learning_rate": 1.672568132086732e-05, + "loss": 0.0427, + "step": 2970 + }, + { + "epoch": 0.19126932550284417, + "grad_norm": 3.681411027908325, + "learning_rate": 1.6712419600822226e-05, + "loss": 0.037, + "step": 2980 + }, + { + "epoch": 0.1919111688770148, + "grad_norm": 2.3046762943267822, + "learning_rate": 1.6699157880777137e-05, + "loss": 0.0467, + "step": 2990 + }, + { + "epoch": 0.1925530122511854, + "grad_norm": 3.976912260055542, + "learning_rate": 1.668589616073205e-05, + "loss": 0.0516, + "step": 3000 + }, + { + "epoch": 0.19319485562535602, + "grad_norm": 2.6646742820739746, + "learning_rate": 1.667263444068696e-05, + "loss": 0.0407, + "step": 3010 + }, + { + "epoch": 0.19383669899952663, + "grad_norm": 1.8901029825210571, + "learning_rate": 1.6659372720641867e-05, + "loss": 0.0321, + "step": 3020 + }, + { + "epoch": 0.19447854237369727, + "grad_norm": 2.3338727951049805, + "learning_rate": 1.664611100059678e-05, + "loss": 0.0609, + "step": 3030 + }, + { + "epoch": 0.19512038574786789, + "grad_norm": 5.7270121574401855, + "learning_rate": 1.663284928055169e-05, + "loss": 0.0466, + "step": 3040 + }, + { + "epoch": 0.1957622291220385, + "grad_norm": 4.083347320556641, + "learning_rate": 1.6619587560506598e-05, + "loss": 0.0491, + "step": 3050 + }, + { + "epoch": 0.19640407249620911, + "grad_norm": 1.9552839994430542, + "learning_rate": 1.660632584046151e-05, + "loss": 0.0524, + "step": 3060 + }, + { + "epoch": 0.19704591587037973, + "grad_norm": 2.211383104324341, + "learning_rate": 1.659306412041642e-05, + "loss": 0.038, + "step": 3070 + }, + { + "epoch": 0.19768775924455034, + "grad_norm": 0.9306750297546387, + "learning_rate": 1.657980240037133e-05, + "loss": 0.0488, + "step": 3080 + }, + { + "epoch": 0.19832960261872096, + "grad_norm": 2.246899366378784, + "learning_rate": 1.656654068032624e-05, + "loss": 0.0413, + "step": 3090 + }, + { + "epoch": 0.19897144599289157, + "grad_norm": 3.014577627182007, + "learning_rate": 1.655327896028115e-05, + "loss": 0.0397, + "step": 3100 + }, + { + "epoch": 0.1996132893670622, + "grad_norm": 5.5264201164245605, + "learning_rate": 1.6540017240236058e-05, + "loss": 0.0371, + "step": 3110 + }, + { + "epoch": 0.20025513274123283, + "grad_norm": 3.594407320022583, + "learning_rate": 1.652675552019097e-05, + "loss": 0.0415, + "step": 3120 + }, + { + "epoch": 0.20089697611540344, + "grad_norm": 1.3521718978881836, + "learning_rate": 1.651349380014588e-05, + "loss": 0.0588, + "step": 3130 + }, + { + "epoch": 0.20153881948957406, + "grad_norm": 9.015791893005371, + "learning_rate": 1.650023208010079e-05, + "loss": 0.0596, + "step": 3140 + }, + { + "epoch": 0.20218066286374467, + "grad_norm": 2.759474754333496, + "learning_rate": 1.64869703600557e-05, + "loss": 0.0498, + "step": 3150 + }, + { + "epoch": 0.20282250623791528, + "grad_norm": 4.701976776123047, + "learning_rate": 1.647370864001061e-05, + "loss": 0.0404, + "step": 3160 + }, + { + "epoch": 0.2034643496120859, + "grad_norm": 6.130093097686768, + "learning_rate": 1.646044691996552e-05, + "loss": 0.0672, + "step": 3170 + }, + { + "epoch": 0.20410619298625654, + "grad_norm": Infinity, + "learning_rate": 1.644718519992043e-05, + "loss": 0.0585, + "step": 3180 + }, + { + "epoch": 0.20474803636042715, + "grad_norm": 2.636387348175049, + "learning_rate": 1.6433923479875344e-05, + "loss": 0.0632, + "step": 3190 + }, + { + "epoch": 0.20538987973459777, + "grad_norm": 1.0386542081832886, + "learning_rate": 1.642066175983025e-05, + "loss": 0.0323, + "step": 3200 + }, + { + "epoch": 0.20603172310876838, + "grad_norm": 6.843395709991455, + "learning_rate": 1.6407400039785163e-05, + "loss": 0.0596, + "step": 3210 + }, + { + "epoch": 0.206673566482939, + "grad_norm": 4.445530891418457, + "learning_rate": 1.639413831974007e-05, + "loss": 0.0532, + "step": 3220 + }, + { + "epoch": 0.2073154098571096, + "grad_norm": 0.7461674213409424, + "learning_rate": 1.6380876599694982e-05, + "loss": 0.0511, + "step": 3230 + }, + { + "epoch": 0.20795725323128023, + "grad_norm": 7.616549015045166, + "learning_rate": 1.6367614879649893e-05, + "loss": 0.0434, + "step": 3240 + }, + { + "epoch": 0.20859909660545087, + "grad_norm": 1.3540281057357788, + "learning_rate": 1.63543531596048e-05, + "loss": 0.0312, + "step": 3250 + }, + { + "epoch": 0.20924093997962148, + "grad_norm": 0.9516130685806274, + "learning_rate": 1.6341091439559712e-05, + "loss": 0.0552, + "step": 3260 + }, + { + "epoch": 0.2098827833537921, + "grad_norm": 0.5843936800956726, + "learning_rate": 1.6327829719514623e-05, + "loss": 0.0163, + "step": 3270 + }, + { + "epoch": 0.2105246267279627, + "grad_norm": 2.156327486038208, + "learning_rate": 1.6314567999469534e-05, + "loss": 0.0367, + "step": 3280 + }, + { + "epoch": 0.21116647010213332, + "grad_norm": 1.6830333471298218, + "learning_rate": 1.6301306279424442e-05, + "loss": 0.033, + "step": 3290 + }, + { + "epoch": 0.21180831347630394, + "grad_norm": 4.959284782409668, + "learning_rate": 1.6288044559379353e-05, + "loss": 0.0334, + "step": 3300 + }, + { + "epoch": 0.21245015685047455, + "grad_norm": 5.351144313812256, + "learning_rate": 1.627478283933426e-05, + "loss": 0.0266, + "step": 3310 + }, + { + "epoch": 0.21309200022464517, + "grad_norm": 2.5864357948303223, + "learning_rate": 1.6261521119289175e-05, + "loss": 0.0276, + "step": 3320 + }, + { + "epoch": 0.2137338435988158, + "grad_norm": 4.892467021942139, + "learning_rate": 1.6248259399244083e-05, + "loss": 0.0535, + "step": 3330 + }, + { + "epoch": 0.21437568697298642, + "grad_norm": 4.954229354858398, + "learning_rate": 1.6234997679198994e-05, + "loss": 0.0344, + "step": 3340 + }, + { + "epoch": 0.21501753034715704, + "grad_norm": 3.781883955001831, + "learning_rate": 1.6221735959153902e-05, + "loss": 0.0301, + "step": 3350 + }, + { + "epoch": 0.21565937372132765, + "grad_norm": 2.5582525730133057, + "learning_rate": 1.6208474239108813e-05, + "loss": 0.0316, + "step": 3360 + }, + { + "epoch": 0.21630121709549827, + "grad_norm": 4.6287736892700195, + "learning_rate": 1.6195212519063725e-05, + "loss": 0.0445, + "step": 3370 + }, + { + "epoch": 0.21694306046966888, + "grad_norm": 3.5289618968963623, + "learning_rate": 1.6181950799018632e-05, + "loss": 0.0493, + "step": 3380 + }, + { + "epoch": 0.2175849038438395, + "grad_norm": 2.802454710006714, + "learning_rate": 1.6168689078973544e-05, + "loss": 0.031, + "step": 3390 + }, + { + "epoch": 0.21822674721801014, + "grad_norm": 2.012895107269287, + "learning_rate": 1.6155427358928455e-05, + "loss": 0.04, + "step": 3400 + }, + { + "epoch": 0.21886859059218075, + "grad_norm": 1.0591882467269897, + "learning_rate": 1.6142165638883366e-05, + "loss": 0.0432, + "step": 3410 + }, + { + "epoch": 0.21951043396635136, + "grad_norm": 1.5135844945907593, + "learning_rate": 1.6128903918838274e-05, + "loss": 0.0289, + "step": 3420 + }, + { + "epoch": 0.22015227734052198, + "grad_norm": 0.37961867451667786, + "learning_rate": 1.6115642198793185e-05, + "loss": 0.0399, + "step": 3430 + }, + { + "epoch": 0.2207941207146926, + "grad_norm": 0.45994833111763, + "learning_rate": 1.6102380478748093e-05, + "loss": 0.0348, + "step": 3440 + }, + { + "epoch": 0.2214359640888632, + "grad_norm": 2.2288460731506348, + "learning_rate": 1.6089118758703004e-05, + "loss": 0.0483, + "step": 3450 + }, + { + "epoch": 0.22207780746303382, + "grad_norm": 1.327779769897461, + "learning_rate": 1.6075857038657915e-05, + "loss": 0.0448, + "step": 3460 + }, + { + "epoch": 0.22271965083720446, + "grad_norm": 0.829675555229187, + "learning_rate": 1.6062595318612826e-05, + "loss": 0.0497, + "step": 3470 + }, + { + "epoch": 0.22336149421137508, + "grad_norm": 3.0021743774414062, + "learning_rate": 1.6049333598567737e-05, + "loss": 0.0422, + "step": 3480 + }, + { + "epoch": 0.2240033375855457, + "grad_norm": 3.0357425212860107, + "learning_rate": 1.6036071878522645e-05, + "loss": 0.0509, + "step": 3490 + }, + { + "epoch": 0.2246451809597163, + "grad_norm": 1.5336086750030518, + "learning_rate": 1.6022810158477556e-05, + "loss": 0.0457, + "step": 3500 + }, + { + "epoch": 0.22528702433388692, + "grad_norm": 1.462803840637207, + "learning_rate": 1.6009548438432464e-05, + "loss": 0.0188, + "step": 3510 + }, + { + "epoch": 0.22592886770805753, + "grad_norm": 5.490964889526367, + "learning_rate": 1.599628671838738e-05, + "loss": 0.0472, + "step": 3520 + }, + { + "epoch": 0.22657071108222815, + "grad_norm": 5.324558258056641, + "learning_rate": 1.5983024998342286e-05, + "loss": 0.0233, + "step": 3530 + }, + { + "epoch": 0.2272125544563988, + "grad_norm": 3.37449312210083, + "learning_rate": 1.5969763278297198e-05, + "loss": 0.0694, + "step": 3540 + }, + { + "epoch": 0.2278543978305694, + "grad_norm": 2.1609532833099365, + "learning_rate": 1.5956501558252105e-05, + "loss": 0.0357, + "step": 3550 + }, + { + "epoch": 0.22849624120474002, + "grad_norm": 0.04418246075510979, + "learning_rate": 1.5943239838207017e-05, + "loss": 0.0224, + "step": 3560 + }, + { + "epoch": 0.22913808457891063, + "grad_norm": 5.424873352050781, + "learning_rate": 1.5929978118161928e-05, + "loss": 0.043, + "step": 3570 + }, + { + "epoch": 0.22977992795308125, + "grad_norm": 4.027209281921387, + "learning_rate": 1.5916716398116835e-05, + "loss": 0.0266, + "step": 3580 + }, + { + "epoch": 0.23042177132725186, + "grad_norm": 1.5458093881607056, + "learning_rate": 1.5903454678071747e-05, + "loss": 0.0493, + "step": 3590 + }, + { + "epoch": 0.23106361470142248, + "grad_norm": 2.4570200443267822, + "learning_rate": 1.5890192958026658e-05, + "loss": 0.04, + "step": 3600 + }, + { + "epoch": 0.2317054580755931, + "grad_norm": 2.155552625656128, + "learning_rate": 1.587693123798157e-05, + "loss": 0.0494, + "step": 3610 + }, + { + "epoch": 0.23234730144976373, + "grad_norm": 1.6081950664520264, + "learning_rate": 1.5863669517936477e-05, + "loss": 0.0399, + "step": 3620 + }, + { + "epoch": 0.23298914482393435, + "grad_norm": 0.6873506903648376, + "learning_rate": 1.5850407797891388e-05, + "loss": 0.0386, + "step": 3630 + }, + { + "epoch": 0.23363098819810496, + "grad_norm": 4.3657050132751465, + "learning_rate": 1.5837146077846296e-05, + "loss": 0.0418, + "step": 3640 + }, + { + "epoch": 0.23427283157227557, + "grad_norm": 7.361201286315918, + "learning_rate": 1.582388435780121e-05, + "loss": 0.0516, + "step": 3650 + }, + { + "epoch": 0.2349146749464462, + "grad_norm": 5.6615118980407715, + "learning_rate": 1.5810622637756118e-05, + "loss": 0.0575, + "step": 3660 + }, + { + "epoch": 0.2355565183206168, + "grad_norm": 2.371455192565918, + "learning_rate": 1.579736091771103e-05, + "loss": 0.0639, + "step": 3670 + }, + { + "epoch": 0.23619836169478742, + "grad_norm": 1.730076789855957, + "learning_rate": 1.5784099197665937e-05, + "loss": 0.0376, + "step": 3680 + }, + { + "epoch": 0.23684020506895806, + "grad_norm": 3.1411263942718506, + "learning_rate": 1.5770837477620848e-05, + "loss": 0.044, + "step": 3690 + }, + { + "epoch": 0.23748204844312867, + "grad_norm": 0.9767663478851318, + "learning_rate": 1.575757575757576e-05, + "loss": 0.0359, + "step": 3700 + }, + { + "epoch": 0.2381238918172993, + "grad_norm": 0.32298144698143005, + "learning_rate": 1.5744314037530667e-05, + "loss": 0.0371, + "step": 3710 + }, + { + "epoch": 0.2387657351914699, + "grad_norm": 0.5231297016143799, + "learning_rate": 1.5731052317485582e-05, + "loss": 0.0485, + "step": 3720 + }, + { + "epoch": 0.23940757856564052, + "grad_norm": 1.5875322818756104, + "learning_rate": 1.571779059744049e-05, + "loss": 0.0349, + "step": 3730 + }, + { + "epoch": 0.24004942193981113, + "grad_norm": 4.602473735809326, + "learning_rate": 1.57045288773954e-05, + "loss": 0.048, + "step": 3740 + }, + { + "epoch": 0.24069126531398174, + "grad_norm": 3.969989061355591, + "learning_rate": 1.569126715735031e-05, + "loss": 0.0355, + "step": 3750 + }, + { + "epoch": 0.2413331086881524, + "grad_norm": 0.8934308290481567, + "learning_rate": 1.567800543730522e-05, + "loss": 0.0501, + "step": 3760 + }, + { + "epoch": 0.241974952062323, + "grad_norm": 5.02524995803833, + "learning_rate": 1.5664743717260127e-05, + "loss": 0.0579, + "step": 3770 + }, + { + "epoch": 0.24261679543649362, + "grad_norm": 4.93678092956543, + "learning_rate": 1.5651481997215042e-05, + "loss": 0.0429, + "step": 3780 + }, + { + "epoch": 0.24325863881066423, + "grad_norm": 3.8344504833221436, + "learning_rate": 1.563822027716995e-05, + "loss": 0.0421, + "step": 3790 + }, + { + "epoch": 0.24390048218483484, + "grad_norm": 4.208110809326172, + "learning_rate": 1.562495855712486e-05, + "loss": 0.0423, + "step": 3800 + }, + { + "epoch": 0.24454232555900546, + "grad_norm": 5.809897422790527, + "learning_rate": 1.5611696837079772e-05, + "loss": 0.063, + "step": 3810 + }, + { + "epoch": 0.24518416893317607, + "grad_norm": 5.782272815704346, + "learning_rate": 1.559843511703468e-05, + "loss": 0.0523, + "step": 3820 + }, + { + "epoch": 0.2458260123073467, + "grad_norm": 1.3277156352996826, + "learning_rate": 1.558517339698959e-05, + "loss": 0.0345, + "step": 3830 + }, + { + "epoch": 0.24646785568151733, + "grad_norm": 8.235041618347168, + "learning_rate": 1.55719116769445e-05, + "loss": 0.0589, + "step": 3840 + }, + { + "epoch": 0.24710969905568794, + "grad_norm": 2.093726396560669, + "learning_rate": 1.5558649956899413e-05, + "loss": 0.0435, + "step": 3850 + }, + { + "epoch": 0.24775154242985856, + "grad_norm": 6.27501106262207, + "learning_rate": 1.554538823685432e-05, + "loss": 0.0328, + "step": 3860 + }, + { + "epoch": 0.24839338580402917, + "grad_norm": 0.37880775332450867, + "learning_rate": 1.5532126516809232e-05, + "loss": 0.0237, + "step": 3870 + }, + { + "epoch": 0.24903522917819979, + "grad_norm": 2.049382209777832, + "learning_rate": 1.551886479676414e-05, + "loss": 0.0421, + "step": 3880 + }, + { + "epoch": 0.2496770725523704, + "grad_norm": 2.3936824798583984, + "learning_rate": 1.550560307671905e-05, + "loss": 0.0403, + "step": 3890 + }, + { + "epoch": 0.25031891592654104, + "grad_norm": 1.6919853687286377, + "learning_rate": 1.5492341356673962e-05, + "loss": 0.0303, + "step": 3900 + }, + { + "epoch": 0.25096075930071166, + "grad_norm": 7.648709774017334, + "learning_rate": 1.5479079636628874e-05, + "loss": 0.037, + "step": 3910 + }, + { + "epoch": 0.25160260267488227, + "grad_norm": 4.136491298675537, + "learning_rate": 1.546581791658378e-05, + "loss": 0.0607, + "step": 3920 + }, + { + "epoch": 0.2522444460490529, + "grad_norm": 0.24048294126987457, + "learning_rate": 1.5452556196538693e-05, + "loss": 0.0564, + "step": 3930 + }, + { + "epoch": 0.2528862894232235, + "grad_norm": 0.7890238761901855, + "learning_rate": 1.5439294476493604e-05, + "loss": 0.0416, + "step": 3940 + }, + { + "epoch": 0.2535281327973941, + "grad_norm": 5.72561502456665, + "learning_rate": 1.542603275644851e-05, + "loss": 0.0606, + "step": 3950 + }, + { + "epoch": 0.2541699761715647, + "grad_norm": 1.5823456048965454, + "learning_rate": 1.5412771036403423e-05, + "loss": 0.0284, + "step": 3960 + }, + { + "epoch": 0.25481181954573534, + "grad_norm": 2.717170238494873, + "learning_rate": 1.539950931635833e-05, + "loss": 0.0302, + "step": 3970 + }, + { + "epoch": 0.25545366291990595, + "grad_norm": 2.981813669204712, + "learning_rate": 1.5386247596313245e-05, + "loss": 0.0504, + "step": 3980 + }, + { + "epoch": 0.25609550629407657, + "grad_norm": 2.449775457382202, + "learning_rate": 1.5372985876268153e-05, + "loss": 0.025, + "step": 3990 + }, + { + "epoch": 0.2567373496682472, + "grad_norm": 0.42797595262527466, + "learning_rate": 1.5359724156223064e-05, + "loss": 0.0451, + "step": 4000 + }, + { + "epoch": 0.2573791930424178, + "grad_norm": 1.5074821710586548, + "learning_rate": 1.5346462436177972e-05, + "loss": 0.0376, + "step": 4010 + }, + { + "epoch": 0.25802103641658847, + "grad_norm": 4.754857540130615, + "learning_rate": 1.5333200716132883e-05, + "loss": 0.0357, + "step": 4020 + }, + { + "epoch": 0.2586628797907591, + "grad_norm": 4.347653388977051, + "learning_rate": 1.5319938996087794e-05, + "loss": 0.0509, + "step": 4030 + }, + { + "epoch": 0.2593047231649297, + "grad_norm": 4.191555500030518, + "learning_rate": 1.5306677276042702e-05, + "loss": 0.0343, + "step": 4040 + }, + { + "epoch": 0.2599465665391003, + "grad_norm": 3.382261037826538, + "learning_rate": 1.5293415555997617e-05, + "loss": 0.0535, + "step": 4050 + }, + { + "epoch": 0.2605884099132709, + "grad_norm": 3.161862373352051, + "learning_rate": 1.5280153835952524e-05, + "loss": 0.0481, + "step": 4060 + }, + { + "epoch": 0.26123025328744154, + "grad_norm": 1.660260558128357, + "learning_rate": 1.5266892115907435e-05, + "loss": 0.0297, + "step": 4070 + }, + { + "epoch": 0.26187209666161215, + "grad_norm": 0.232618048787117, + "learning_rate": 1.5253630395862345e-05, + "loss": 0.0371, + "step": 4080 + }, + { + "epoch": 0.26251394003578277, + "grad_norm": 3.75004243850708, + "learning_rate": 1.5240368675817254e-05, + "loss": 0.0469, + "step": 4090 + }, + { + "epoch": 0.2631557834099534, + "grad_norm": 3.747248888015747, + "learning_rate": 1.5227106955772164e-05, + "loss": 0.0612, + "step": 4100 + }, + { + "epoch": 0.263797626784124, + "grad_norm": 9.171794891357422, + "learning_rate": 1.5213845235727077e-05, + "loss": 0.0599, + "step": 4110 + }, + { + "epoch": 0.2644394701582946, + "grad_norm": 0.9723399877548218, + "learning_rate": 1.5200583515681986e-05, + "loss": 0.0275, + "step": 4120 + }, + { + "epoch": 0.2650813135324652, + "grad_norm": 1.1156556606292725, + "learning_rate": 1.5187321795636896e-05, + "loss": 0.0306, + "step": 4130 + }, + { + "epoch": 0.26572315690663584, + "grad_norm": 2.5860493183135986, + "learning_rate": 1.5174060075591805e-05, + "loss": 0.0321, + "step": 4140 + }, + { + "epoch": 0.26636500028080645, + "grad_norm": 5.233616828918457, + "learning_rate": 1.5160798355546715e-05, + "loss": 0.0463, + "step": 4150 + }, + { + "epoch": 0.2670068436549771, + "grad_norm": 4.714346408843994, + "learning_rate": 1.5147536635501626e-05, + "loss": 0.029, + "step": 4160 + }, + { + "epoch": 0.26764868702914774, + "grad_norm": 5.325324058532715, + "learning_rate": 1.5134274915456535e-05, + "loss": 0.0323, + "step": 4170 + }, + { + "epoch": 0.26829053040331835, + "grad_norm": 2.1289708614349365, + "learning_rate": 1.5121013195411447e-05, + "loss": 0.0542, + "step": 4180 + }, + { + "epoch": 0.26893237377748896, + "grad_norm": 1.2925282716751099, + "learning_rate": 1.5107751475366356e-05, + "loss": 0.034, + "step": 4190 + }, + { + "epoch": 0.2695742171516596, + "grad_norm": 0.7632297873497009, + "learning_rate": 1.5094489755321267e-05, + "loss": 0.0482, + "step": 4200 + }, + { + "epoch": 0.2702160605258302, + "grad_norm": 1.8542927503585815, + "learning_rate": 1.5081228035276177e-05, + "loss": 0.0441, + "step": 4210 + }, + { + "epoch": 0.2708579039000008, + "grad_norm": 1.7105222940444946, + "learning_rate": 1.5067966315231086e-05, + "loss": 0.0316, + "step": 4220 + }, + { + "epoch": 0.2714997472741714, + "grad_norm": 0.39926835894584656, + "learning_rate": 1.5054704595185996e-05, + "loss": 0.0362, + "step": 4230 + }, + { + "epoch": 0.27214159064834204, + "grad_norm": 1.7719646692276, + "learning_rate": 1.5041442875140908e-05, + "loss": 0.0351, + "step": 4240 + }, + { + "epoch": 0.27278343402251265, + "grad_norm": 1.2764171361923218, + "learning_rate": 1.5028181155095818e-05, + "loss": 0.0344, + "step": 4250 + }, + { + "epoch": 0.27342527739668326, + "grad_norm": 0.1933438628911972, + "learning_rate": 1.5014919435050727e-05, + "loss": 0.0301, + "step": 4260 + }, + { + "epoch": 0.2740671207708539, + "grad_norm": 0.7615514397621155, + "learning_rate": 1.5001657715005637e-05, + "loss": 0.0457, + "step": 4270 + }, + { + "epoch": 0.2747089641450245, + "grad_norm": 1.1360198259353638, + "learning_rate": 1.4988395994960548e-05, + "loss": 0.0301, + "step": 4280 + }, + { + "epoch": 0.2753508075191951, + "grad_norm": 2.9905829429626465, + "learning_rate": 1.4975134274915458e-05, + "loss": 0.0341, + "step": 4290 + }, + { + "epoch": 0.2759926508933657, + "grad_norm": 1.4750293493270874, + "learning_rate": 1.4961872554870367e-05, + "loss": 0.0333, + "step": 4300 + }, + { + "epoch": 0.2766344942675364, + "grad_norm": 0.11904127895832062, + "learning_rate": 1.4948610834825278e-05, + "loss": 0.0293, + "step": 4310 + }, + { + "epoch": 0.277276337641707, + "grad_norm": 2.860715389251709, + "learning_rate": 1.493534911478019e-05, + "loss": 0.0262, + "step": 4320 + }, + { + "epoch": 0.2779181810158776, + "grad_norm": 3.2877347469329834, + "learning_rate": 1.4922087394735099e-05, + "loss": 0.0259, + "step": 4330 + }, + { + "epoch": 0.27856002439004823, + "grad_norm": 0.1747274398803711, + "learning_rate": 1.4908825674690008e-05, + "loss": 0.0203, + "step": 4340 + }, + { + "epoch": 0.27920186776421885, + "grad_norm": 1.7592897415161133, + "learning_rate": 1.4895563954644918e-05, + "loss": 0.0332, + "step": 4350 + }, + { + "epoch": 0.27984371113838946, + "grad_norm": 3.454744338989258, + "learning_rate": 1.4882302234599827e-05, + "loss": 0.0387, + "step": 4360 + }, + { + "epoch": 0.2804855545125601, + "grad_norm": 2.7865166664123535, + "learning_rate": 1.486904051455474e-05, + "loss": 0.026, + "step": 4370 + }, + { + "epoch": 0.2811273978867307, + "grad_norm": 1.9861980676651, + "learning_rate": 1.485577879450965e-05, + "loss": 0.0349, + "step": 4380 + }, + { + "epoch": 0.2817692412609013, + "grad_norm": 1.646071434020996, + "learning_rate": 1.4842517074464559e-05, + "loss": 0.0297, + "step": 4390 + }, + { + "epoch": 0.2824110846350719, + "grad_norm": 0.5253218412399292, + "learning_rate": 1.482925535441947e-05, + "loss": 0.0311, + "step": 4400 + }, + { + "epoch": 0.28305292800924253, + "grad_norm": 0.11210356652736664, + "learning_rate": 1.481599363437438e-05, + "loss": 0.0344, + "step": 4410 + }, + { + "epoch": 0.28369477138341315, + "grad_norm": 6.261229038238525, + "learning_rate": 1.480273191432929e-05, + "loss": 0.0278, + "step": 4420 + }, + { + "epoch": 0.28433661475758376, + "grad_norm": 0.5738747715950012, + "learning_rate": 1.4789470194284199e-05, + "loss": 0.038, + "step": 4430 + }, + { + "epoch": 0.2849784581317544, + "grad_norm": 1.1812382936477661, + "learning_rate": 1.4776208474239112e-05, + "loss": 0.0408, + "step": 4440 + }, + { + "epoch": 0.285620301505925, + "grad_norm": 6.855234146118164, + "learning_rate": 1.4762946754194021e-05, + "loss": 0.0403, + "step": 4450 + }, + { + "epoch": 0.28626214488009566, + "grad_norm": 0.6492354273796082, + "learning_rate": 1.474968503414893e-05, + "loss": 0.0487, + "step": 4460 + }, + { + "epoch": 0.2869039882542663, + "grad_norm": 7.180405139923096, + "learning_rate": 1.473642331410384e-05, + "loss": 0.0401, + "step": 4470 + }, + { + "epoch": 0.2875458316284369, + "grad_norm": 2.133906602859497, + "learning_rate": 1.472316159405875e-05, + "loss": 0.0323, + "step": 4480 + }, + { + "epoch": 0.2881876750026075, + "grad_norm": 4.206572532653809, + "learning_rate": 1.470989987401366e-05, + "loss": 0.036, + "step": 4490 + }, + { + "epoch": 0.2888295183767781, + "grad_norm": 4.9672417640686035, + "learning_rate": 1.469663815396857e-05, + "loss": 0.0449, + "step": 4500 + }, + { + "epoch": 0.28947136175094873, + "grad_norm": 3.2657997608184814, + "learning_rate": 1.4683376433923481e-05, + "loss": 0.0287, + "step": 4510 + }, + { + "epoch": 0.29011320512511934, + "grad_norm": 0.8293973207473755, + "learning_rate": 1.4670114713878392e-05, + "loss": 0.0585, + "step": 4520 + }, + { + "epoch": 0.29075504849928996, + "grad_norm": 3.678178071975708, + "learning_rate": 1.4656852993833302e-05, + "loss": 0.0351, + "step": 4530 + }, + { + "epoch": 0.2913968918734606, + "grad_norm": 0.9538152813911438, + "learning_rate": 1.4643591273788211e-05, + "loss": 0.0496, + "step": 4540 + }, + { + "epoch": 0.2920387352476312, + "grad_norm": 1.1784247159957886, + "learning_rate": 1.4630329553743121e-05, + "loss": 0.0353, + "step": 4550 + }, + { + "epoch": 0.2926805786218018, + "grad_norm": 1.781537413597107, + "learning_rate": 1.461706783369803e-05, + "loss": 0.0378, + "step": 4560 + }, + { + "epoch": 0.2933224219959724, + "grad_norm": 2.990455150604248, + "learning_rate": 1.4603806113652943e-05, + "loss": 0.0408, + "step": 4570 + }, + { + "epoch": 0.29396426537014303, + "grad_norm": 5.851528167724609, + "learning_rate": 1.4590544393607853e-05, + "loss": 0.0328, + "step": 4580 + }, + { + "epoch": 0.29460610874431364, + "grad_norm": 0.7276738882064819, + "learning_rate": 1.4577282673562762e-05, + "loss": 0.0546, + "step": 4590 + }, + { + "epoch": 0.2952479521184843, + "grad_norm": 1.7559603452682495, + "learning_rate": 1.4564020953517672e-05, + "loss": 0.0305, + "step": 4600 + }, + { + "epoch": 0.29588979549265493, + "grad_norm": 2.0152103900909424, + "learning_rate": 1.4550759233472583e-05, + "loss": 0.0302, + "step": 4610 + }, + { + "epoch": 0.29653163886682554, + "grad_norm": 0.8355568647384644, + "learning_rate": 1.4537497513427492e-05, + "loss": 0.0314, + "step": 4620 + }, + { + "epoch": 0.29717348224099616, + "grad_norm": 1.1026408672332764, + "learning_rate": 1.4524235793382402e-05, + "loss": 0.036, + "step": 4630 + }, + { + "epoch": 0.29781532561516677, + "grad_norm": 4.210587024688721, + "learning_rate": 1.4510974073337315e-05, + "loss": 0.0299, + "step": 4640 + }, + { + "epoch": 0.2984571689893374, + "grad_norm": 2.904656171798706, + "learning_rate": 1.4497712353292224e-05, + "loss": 0.039, + "step": 4650 + }, + { + "epoch": 0.299099012363508, + "grad_norm": 3.1882083415985107, + "learning_rate": 1.4484450633247134e-05, + "loss": 0.0524, + "step": 4660 + }, + { + "epoch": 0.2997408557376786, + "grad_norm": 1.4921969175338745, + "learning_rate": 1.4471188913202043e-05, + "loss": 0.0407, + "step": 4670 + }, + { + "epoch": 0.3003826991118492, + "grad_norm": 0.5533859729766846, + "learning_rate": 1.4457927193156953e-05, + "loss": 0.0419, + "step": 4680 + }, + { + "epoch": 0.30102454248601984, + "grad_norm": 0.90987628698349, + "learning_rate": 1.4444665473111864e-05, + "loss": 0.0489, + "step": 4690 + }, + { + "epoch": 0.30166638586019046, + "grad_norm": 4.835458755493164, + "learning_rate": 1.4431403753066775e-05, + "loss": 0.0305, + "step": 4700 + }, + { + "epoch": 0.30230822923436107, + "grad_norm": 1.4399257898330688, + "learning_rate": 1.4418142033021684e-05, + "loss": 0.0261, + "step": 4710 + }, + { + "epoch": 0.3029500726085317, + "grad_norm": 0.8075177073478699, + "learning_rate": 1.4404880312976594e-05, + "loss": 0.039, + "step": 4720 + }, + { + "epoch": 0.3035919159827023, + "grad_norm": 0.26249435544013977, + "learning_rate": 1.4391618592931505e-05, + "loss": 0.0238, + "step": 4730 + }, + { + "epoch": 0.3042337593568729, + "grad_norm": 2.859839677810669, + "learning_rate": 1.4378356872886415e-05, + "loss": 0.0348, + "step": 4740 + }, + { + "epoch": 0.3048756027310436, + "grad_norm": 5.632012844085693, + "learning_rate": 1.4365095152841324e-05, + "loss": 0.0601, + "step": 4750 + }, + { + "epoch": 0.3055174461052142, + "grad_norm": 0.5732155442237854, + "learning_rate": 1.4351833432796234e-05, + "loss": 0.0209, + "step": 4760 + }, + { + "epoch": 0.3061592894793848, + "grad_norm": 3.579094171524048, + "learning_rate": 1.4338571712751146e-05, + "loss": 0.0273, + "step": 4770 + }, + { + "epoch": 0.3068011328535554, + "grad_norm": 3.8557450771331787, + "learning_rate": 1.4325309992706056e-05, + "loss": 0.0309, + "step": 4780 + }, + { + "epoch": 0.30744297622772604, + "grad_norm": 1.1692198514938354, + "learning_rate": 1.4312048272660965e-05, + "loss": 0.0249, + "step": 4790 + }, + { + "epoch": 0.30808481960189665, + "grad_norm": 5.022751331329346, + "learning_rate": 1.4298786552615875e-05, + "loss": 0.0445, + "step": 4800 + }, + { + "epoch": 0.30872666297606727, + "grad_norm": 0.4565708041191101, + "learning_rate": 1.4285524832570784e-05, + "loss": 0.0185, + "step": 4810 + }, + { + "epoch": 0.3093685063502379, + "grad_norm": 0.2485363483428955, + "learning_rate": 1.4272263112525695e-05, + "loss": 0.0334, + "step": 4820 + }, + { + "epoch": 0.3100103497244085, + "grad_norm": 0.2797245681285858, + "learning_rate": 1.4259001392480607e-05, + "loss": 0.0408, + "step": 4830 + }, + { + "epoch": 0.3106521930985791, + "grad_norm": 3.8306257724761963, + "learning_rate": 1.4245739672435516e-05, + "loss": 0.0486, + "step": 4840 + }, + { + "epoch": 0.3112940364727497, + "grad_norm": 3.26303768157959, + "learning_rate": 1.4232477952390427e-05, + "loss": 0.0245, + "step": 4850 + }, + { + "epoch": 0.31193587984692034, + "grad_norm": 0.26074540615081787, + "learning_rate": 1.4219216232345337e-05, + "loss": 0.0282, + "step": 4860 + }, + { + "epoch": 0.31257772322109095, + "grad_norm": 4.135842800140381, + "learning_rate": 1.4205954512300246e-05, + "loss": 0.0367, + "step": 4870 + }, + { + "epoch": 0.31321956659526157, + "grad_norm": 2.937664031982422, + "learning_rate": 1.4192692792255156e-05, + "loss": 0.0286, + "step": 4880 + }, + { + "epoch": 0.31386140996943224, + "grad_norm": 0.9610151052474976, + "learning_rate": 1.4179431072210065e-05, + "loss": 0.0262, + "step": 4890 + }, + { + "epoch": 0.31450325334360285, + "grad_norm": 1.348313570022583, + "learning_rate": 1.4166169352164978e-05, + "loss": 0.0471, + "step": 4900 + }, + { + "epoch": 0.31514509671777347, + "grad_norm": 1.0628247261047363, + "learning_rate": 1.4152907632119888e-05, + "loss": 0.0278, + "step": 4910 + }, + { + "epoch": 0.3157869400919441, + "grad_norm": 2.1277549266815186, + "learning_rate": 1.4139645912074797e-05, + "loss": 0.0422, + "step": 4920 + }, + { + "epoch": 0.3164287834661147, + "grad_norm": 4.375251293182373, + "learning_rate": 1.4126384192029707e-05, + "loss": 0.0448, + "step": 4930 + }, + { + "epoch": 0.3170706268402853, + "grad_norm": 2.4289703369140625, + "learning_rate": 1.4113122471984618e-05, + "loss": 0.0291, + "step": 4940 + }, + { + "epoch": 0.3177124702144559, + "grad_norm": 1.17922043800354, + "learning_rate": 1.4099860751939527e-05, + "loss": 0.024, + "step": 4950 + }, + { + "epoch": 0.31835431358862654, + "grad_norm": 0.7155043482780457, + "learning_rate": 1.4086599031894437e-05, + "loss": 0.0164, + "step": 4960 + }, + { + "epoch": 0.31899615696279715, + "grad_norm": 2.958674907684326, + "learning_rate": 1.407333731184935e-05, + "loss": 0.0408, + "step": 4970 + }, + { + "epoch": 0.31963800033696776, + "grad_norm": 1.0707948207855225, + "learning_rate": 1.4060075591804259e-05, + "loss": 0.0351, + "step": 4980 + }, + { + "epoch": 0.3202798437111384, + "grad_norm": 9.325459480285645, + "learning_rate": 1.4046813871759168e-05, + "loss": 0.0592, + "step": 4990 + }, + { + "epoch": 0.320921687085309, + "grad_norm": 0.5737670660018921, + "learning_rate": 1.4033552151714078e-05, + "loss": 0.0245, + "step": 5000 + }, + { + "epoch": 0.3215635304594796, + "grad_norm": 1.7294018268585205, + "learning_rate": 1.4020290431668987e-05, + "loss": 0.0341, + "step": 5010 + }, + { + "epoch": 0.3222053738336502, + "grad_norm": 3.3348608016967773, + "learning_rate": 1.4007028711623899e-05, + "loss": 0.0312, + "step": 5020 + }, + { + "epoch": 0.32284721720782084, + "grad_norm": 1.2371169328689575, + "learning_rate": 1.399376699157881e-05, + "loss": 0.0314, + "step": 5030 + }, + { + "epoch": 0.3234890605819915, + "grad_norm": 0.5545613765716553, + "learning_rate": 1.398050527153372e-05, + "loss": 0.0248, + "step": 5040 + }, + { + "epoch": 0.3241309039561621, + "grad_norm": 0.1487692892551422, + "learning_rate": 1.3967243551488629e-05, + "loss": 0.0245, + "step": 5050 + }, + { + "epoch": 0.32477274733033273, + "grad_norm": 5.844099044799805, + "learning_rate": 1.395398183144354e-05, + "loss": 0.0333, + "step": 5060 + }, + { + "epoch": 0.32541459070450335, + "grad_norm": 1.8279471397399902, + "learning_rate": 1.394072011139845e-05, + "loss": 0.0327, + "step": 5070 + }, + { + "epoch": 0.32605643407867396, + "grad_norm": 4.413207054138184, + "learning_rate": 1.3927458391353359e-05, + "loss": 0.0415, + "step": 5080 + }, + { + "epoch": 0.3266982774528446, + "grad_norm": 4.457259178161621, + "learning_rate": 1.3914196671308268e-05, + "loss": 0.0412, + "step": 5090 + }, + { + "epoch": 0.3273401208270152, + "grad_norm": 0.5590922236442566, + "learning_rate": 1.3900934951263181e-05, + "loss": 0.0333, + "step": 5100 + }, + { + "epoch": 0.3279819642011858, + "grad_norm": 3.6390750408172607, + "learning_rate": 1.388767323121809e-05, + "loss": 0.0509, + "step": 5110 + }, + { + "epoch": 0.3286238075753564, + "grad_norm": 1.944118618965149, + "learning_rate": 1.3874411511173e-05, + "loss": 0.0313, + "step": 5120 + }, + { + "epoch": 0.32926565094952703, + "grad_norm": 1.424323558807373, + "learning_rate": 1.386114979112791e-05, + "loss": 0.026, + "step": 5130 + }, + { + "epoch": 0.32990749432369765, + "grad_norm": 2.957878351211548, + "learning_rate": 1.384788807108282e-05, + "loss": 0.0238, + "step": 5140 + }, + { + "epoch": 0.33054933769786826, + "grad_norm": 1.0304313898086548, + "learning_rate": 1.383462635103773e-05, + "loss": 0.0261, + "step": 5150 + }, + { + "epoch": 0.3311911810720389, + "grad_norm": 2.29116153717041, + "learning_rate": 1.3821364630992641e-05, + "loss": 0.0362, + "step": 5160 + }, + { + "epoch": 0.3318330244462095, + "grad_norm": 1.6272333860397339, + "learning_rate": 1.3808102910947551e-05, + "loss": 0.0172, + "step": 5170 + }, + { + "epoch": 0.3324748678203801, + "grad_norm": 0.1753595918416977, + "learning_rate": 1.3794841190902462e-05, + "loss": 0.033, + "step": 5180 + }, + { + "epoch": 0.3331167111945508, + "grad_norm": 0.24735307693481445, + "learning_rate": 1.3781579470857372e-05, + "loss": 0.0464, + "step": 5190 + }, + { + "epoch": 0.3337585545687214, + "grad_norm": 1.140425205230713, + "learning_rate": 1.3768317750812281e-05, + "loss": 0.02, + "step": 5200 + }, + { + "epoch": 0.334400397942892, + "grad_norm": 4.770532131195068, + "learning_rate": 1.375505603076719e-05, + "loss": 0.0331, + "step": 5210 + }, + { + "epoch": 0.3350422413170626, + "grad_norm": 1.9777387380599976, + "learning_rate": 1.37417943107221e-05, + "loss": 0.0236, + "step": 5220 + }, + { + "epoch": 0.33568408469123323, + "grad_norm": 5.254188537597656, + "learning_rate": 1.3728532590677013e-05, + "loss": 0.0427, + "step": 5230 + }, + { + "epoch": 0.33632592806540385, + "grad_norm": 1.9346436262130737, + "learning_rate": 1.3715270870631922e-05, + "loss": 0.0456, + "step": 5240 + }, + { + "epoch": 0.33696777143957446, + "grad_norm": 5.422431945800781, + "learning_rate": 1.3702009150586832e-05, + "loss": 0.043, + "step": 5250 + }, + { + "epoch": 0.3376096148137451, + "grad_norm": 4.341948986053467, + "learning_rate": 1.3688747430541743e-05, + "loss": 0.0433, + "step": 5260 + }, + { + "epoch": 0.3382514581879157, + "grad_norm": 0.7508522868156433, + "learning_rate": 1.3675485710496652e-05, + "loss": 0.0302, + "step": 5270 + }, + { + "epoch": 0.3388933015620863, + "grad_norm": 2.358661651611328, + "learning_rate": 1.3662223990451562e-05, + "loss": 0.0213, + "step": 5280 + }, + { + "epoch": 0.3395351449362569, + "grad_norm": 0.9036428332328796, + "learning_rate": 1.3648962270406473e-05, + "loss": 0.0288, + "step": 5290 + }, + { + "epoch": 0.34017698831042753, + "grad_norm": 2.609917640686035, + "learning_rate": 1.3635700550361384e-05, + "loss": 0.0263, + "step": 5300 + }, + { + "epoch": 0.34081883168459814, + "grad_norm": 0.23868480324745178, + "learning_rate": 1.3622438830316294e-05, + "loss": 0.0103, + "step": 5310 + }, + { + "epoch": 0.34146067505876876, + "grad_norm": 5.040385723114014, + "learning_rate": 1.3609177110271203e-05, + "loss": 0.0375, + "step": 5320 + }, + { + "epoch": 0.34210251843293943, + "grad_norm": 1.8747702836990356, + "learning_rate": 1.3595915390226113e-05, + "loss": 0.0234, + "step": 5330 + }, + { + "epoch": 0.34274436180711004, + "grad_norm": 3.535433053970337, + "learning_rate": 1.3582653670181022e-05, + "loss": 0.0337, + "step": 5340 + }, + { + "epoch": 0.34338620518128066, + "grad_norm": 1.0234447717666626, + "learning_rate": 1.3569391950135933e-05, + "loss": 0.0278, + "step": 5350 + }, + { + "epoch": 0.34402804855545127, + "grad_norm": 4.323075771331787, + "learning_rate": 1.3556130230090845e-05, + "loss": 0.0292, + "step": 5360 + }, + { + "epoch": 0.3446698919296219, + "grad_norm": 6.044609069824219, + "learning_rate": 1.3542868510045754e-05, + "loss": 0.0336, + "step": 5370 + }, + { + "epoch": 0.3453117353037925, + "grad_norm": 0.4520716369152069, + "learning_rate": 1.3529606790000665e-05, + "loss": 0.0386, + "step": 5380 + }, + { + "epoch": 0.3459535786779631, + "grad_norm": 1.0738428831100464, + "learning_rate": 1.3516345069955575e-05, + "loss": 0.0349, + "step": 5390 + }, + { + "epoch": 0.34659542205213373, + "grad_norm": 4.519723415374756, + "learning_rate": 1.3503083349910484e-05, + "loss": 0.0434, + "step": 5400 + }, + { + "epoch": 0.34723726542630434, + "grad_norm": 1.12299382686615, + "learning_rate": 1.3489821629865394e-05, + "loss": 0.0409, + "step": 5410 + }, + { + "epoch": 0.34787910880047496, + "grad_norm": 1.1750973463058472, + "learning_rate": 1.3476559909820303e-05, + "loss": 0.0175, + "step": 5420 + }, + { + "epoch": 0.34852095217464557, + "grad_norm": 0.6646749973297119, + "learning_rate": 1.3463298189775216e-05, + "loss": 0.0244, + "step": 5430 + }, + { + "epoch": 0.3491627955488162, + "grad_norm": 0.40298768877983093, + "learning_rate": 1.3450036469730125e-05, + "loss": 0.0322, + "step": 5440 + }, + { + "epoch": 0.3498046389229868, + "grad_norm": 5.02630090713501, + "learning_rate": 1.3436774749685035e-05, + "loss": 0.0469, + "step": 5450 + }, + { + "epoch": 0.3504464822971574, + "grad_norm": 0.8722269535064697, + "learning_rate": 1.3423513029639944e-05, + "loss": 0.0376, + "step": 5460 + }, + { + "epoch": 0.351088325671328, + "grad_norm": 0.7096695303916931, + "learning_rate": 1.3410251309594856e-05, + "loss": 0.0206, + "step": 5470 + }, + { + "epoch": 0.3517301690454987, + "grad_norm": 0.901951014995575, + "learning_rate": 1.3396989589549765e-05, + "loss": 0.0215, + "step": 5480 + }, + { + "epoch": 0.3523720124196693, + "grad_norm": 0.25191473960876465, + "learning_rate": 1.3383727869504676e-05, + "loss": 0.0231, + "step": 5490 + }, + { + "epoch": 0.3530138557938399, + "grad_norm": 4.8230180740356445, + "learning_rate": 1.3370466149459587e-05, + "loss": 0.0414, + "step": 5500 + }, + { + "epoch": 0.35365569916801054, + "grad_norm": 3.088327169418335, + "learning_rate": 1.3357204429414497e-05, + "loss": 0.0278, + "step": 5510 + }, + { + "epoch": 0.35429754254218115, + "grad_norm": 1.8572773933410645, + "learning_rate": 1.3343942709369406e-05, + "loss": 0.0201, + "step": 5520 + }, + { + "epoch": 0.35493938591635177, + "grad_norm": 4.562084674835205, + "learning_rate": 1.3330680989324316e-05, + "loss": 0.0196, + "step": 5530 + }, + { + "epoch": 0.3555812292905224, + "grad_norm": 0.2433650940656662, + "learning_rate": 1.3317419269279225e-05, + "loss": 0.0395, + "step": 5540 + }, + { + "epoch": 0.356223072664693, + "grad_norm": 3.763524055480957, + "learning_rate": 1.3304157549234137e-05, + "loss": 0.0419, + "step": 5550 + }, + { + "epoch": 0.3568649160388636, + "grad_norm": 1.4221765995025635, + "learning_rate": 1.3290895829189048e-05, + "loss": 0.0238, + "step": 5560 + }, + { + "epoch": 0.3575067594130342, + "grad_norm": 0.5821696519851685, + "learning_rate": 1.3277634109143957e-05, + "loss": 0.0279, + "step": 5570 + }, + { + "epoch": 0.35814860278720484, + "grad_norm": 2.7642388343811035, + "learning_rate": 1.3264372389098867e-05, + "loss": 0.0429, + "step": 5580 + }, + { + "epoch": 0.35879044616137545, + "grad_norm": 5.3998565673828125, + "learning_rate": 1.3251110669053778e-05, + "loss": 0.031, + "step": 5590 + }, + { + "epoch": 0.35943228953554607, + "grad_norm": 0.39380407333374023, + "learning_rate": 1.3237848949008687e-05, + "loss": 0.0312, + "step": 5600 + }, + { + "epoch": 0.3600741329097167, + "grad_norm": 0.33455953001976013, + "learning_rate": 1.3224587228963597e-05, + "loss": 0.0281, + "step": 5610 + }, + { + "epoch": 0.36071597628388735, + "grad_norm": 1.693926453590393, + "learning_rate": 1.321132550891851e-05, + "loss": 0.0218, + "step": 5620 + }, + { + "epoch": 0.36135781965805797, + "grad_norm": 0.2952551245689392, + "learning_rate": 1.3198063788873419e-05, + "loss": 0.0291, + "step": 5630 + }, + { + "epoch": 0.3619996630322286, + "grad_norm": 1.343604326248169, + "learning_rate": 1.3184802068828329e-05, + "loss": 0.0385, + "step": 5640 + }, + { + "epoch": 0.3626415064063992, + "grad_norm": 0.8492736220359802, + "learning_rate": 1.3171540348783238e-05, + "loss": 0.0367, + "step": 5650 + }, + { + "epoch": 0.3632833497805698, + "grad_norm": 1.1233781576156616, + "learning_rate": 1.3158278628738148e-05, + "loss": 0.0181, + "step": 5660 + }, + { + "epoch": 0.3639251931547404, + "grad_norm": 0.07301205396652222, + "learning_rate": 1.3145016908693059e-05, + "loss": 0.0211, + "step": 5670 + }, + { + "epoch": 0.36456703652891104, + "grad_norm": 2.1094229221343994, + "learning_rate": 1.3131755188647968e-05, + "loss": 0.0218, + "step": 5680 + }, + { + "epoch": 0.36520887990308165, + "grad_norm": 0.391240656375885, + "learning_rate": 1.311849346860288e-05, + "loss": 0.0343, + "step": 5690 + }, + { + "epoch": 0.36585072327725227, + "grad_norm": 0.3236871063709259, + "learning_rate": 1.3105231748557789e-05, + "loss": 0.0285, + "step": 5700 + }, + { + "epoch": 0.3664925666514229, + "grad_norm": 0.9702367186546326, + "learning_rate": 1.30919700285127e-05, + "loss": 0.0343, + "step": 5710 + }, + { + "epoch": 0.3671344100255935, + "grad_norm": 0.8269650936126709, + "learning_rate": 1.307870830846761e-05, + "loss": 0.0289, + "step": 5720 + }, + { + "epoch": 0.3677762533997641, + "grad_norm": 1.4295783042907715, + "learning_rate": 1.3065446588422519e-05, + "loss": 0.0367, + "step": 5730 + }, + { + "epoch": 0.3684180967739347, + "grad_norm": 1.9564656019210815, + "learning_rate": 1.3052184868377428e-05, + "loss": 0.0265, + "step": 5740 + }, + { + "epoch": 0.36905994014810534, + "grad_norm": 0.7807095050811768, + "learning_rate": 1.3038923148332341e-05, + "loss": 0.0476, + "step": 5750 + }, + { + "epoch": 0.36970178352227595, + "grad_norm": 1.0274103879928589, + "learning_rate": 1.302566142828725e-05, + "loss": 0.028, + "step": 5760 + }, + { + "epoch": 0.3703436268964466, + "grad_norm": 3.565110921859741, + "learning_rate": 1.301239970824216e-05, + "loss": 0.0322, + "step": 5770 + }, + { + "epoch": 0.37098547027061723, + "grad_norm": 1.3256715536117554, + "learning_rate": 1.299913798819707e-05, + "loss": 0.0332, + "step": 5780 + }, + { + "epoch": 0.37162731364478785, + "grad_norm": 2.2184195518493652, + "learning_rate": 1.2985876268151981e-05, + "loss": 0.0281, + "step": 5790 + }, + { + "epoch": 0.37226915701895846, + "grad_norm": 1.855116367340088, + "learning_rate": 1.297261454810689e-05, + "loss": 0.0434, + "step": 5800 + }, + { + "epoch": 0.3729110003931291, + "grad_norm": 0.612144947052002, + "learning_rate": 1.29593528280618e-05, + "loss": 0.0361, + "step": 5810 + }, + { + "epoch": 0.3735528437672997, + "grad_norm": 2.8559155464172363, + "learning_rate": 1.2946091108016711e-05, + "loss": 0.0387, + "step": 5820 + }, + { + "epoch": 0.3741946871414703, + "grad_norm": 0.6401782631874084, + "learning_rate": 1.2932829387971622e-05, + "loss": 0.0418, + "step": 5830 + }, + { + "epoch": 0.3748365305156409, + "grad_norm": 2.0480880737304688, + "learning_rate": 1.2919567667926532e-05, + "loss": 0.0366, + "step": 5840 + }, + { + "epoch": 0.37547837388981153, + "grad_norm": 0.23915986716747284, + "learning_rate": 1.2906305947881441e-05, + "loss": 0.0304, + "step": 5850 + }, + { + "epoch": 0.37612021726398215, + "grad_norm": 1.4414054155349731, + "learning_rate": 1.289304422783635e-05, + "loss": 0.0254, + "step": 5860 + }, + { + "epoch": 0.37676206063815276, + "grad_norm": 1.5166568756103516, + "learning_rate": 1.287978250779126e-05, + "loss": 0.05, + "step": 5870 + }, + { + "epoch": 0.3774039040123234, + "grad_norm": 1.248548984527588, + "learning_rate": 1.2866520787746171e-05, + "loss": 0.0259, + "step": 5880 + }, + { + "epoch": 0.378045747386494, + "grad_norm": 1.0585907697677612, + "learning_rate": 1.2853259067701082e-05, + "loss": 0.0313, + "step": 5890 + }, + { + "epoch": 0.3786875907606646, + "grad_norm": 3.518057346343994, + "learning_rate": 1.2839997347655992e-05, + "loss": 0.0359, + "step": 5900 + }, + { + "epoch": 0.3793294341348352, + "grad_norm": 5.588145732879639, + "learning_rate": 1.2826735627610901e-05, + "loss": 0.0443, + "step": 5910 + }, + { + "epoch": 0.3799712775090059, + "grad_norm": 3.4836244583129883, + "learning_rate": 1.2813473907565813e-05, + "loss": 0.0302, + "step": 5920 + }, + { + "epoch": 0.3806131208831765, + "grad_norm": 0.4826532006263733, + "learning_rate": 1.2800212187520722e-05, + "loss": 0.0215, + "step": 5930 + }, + { + "epoch": 0.3812549642573471, + "grad_norm": 1.2696845531463623, + "learning_rate": 1.2786950467475632e-05, + "loss": 0.0239, + "step": 5940 + }, + { + "epoch": 0.38189680763151773, + "grad_norm": 1.2803621292114258, + "learning_rate": 1.2773688747430544e-05, + "loss": 0.0291, + "step": 5950 + }, + { + "epoch": 0.38253865100568835, + "grad_norm": 2.4219958782196045, + "learning_rate": 1.2760427027385454e-05, + "loss": 0.0381, + "step": 5960 + }, + { + "epoch": 0.38318049437985896, + "grad_norm": 3.6280503273010254, + "learning_rate": 1.2747165307340363e-05, + "loss": 0.0316, + "step": 5970 + }, + { + "epoch": 0.3838223377540296, + "grad_norm": 1.5109280347824097, + "learning_rate": 1.2733903587295273e-05, + "loss": 0.0226, + "step": 5980 + }, + { + "epoch": 0.3844641811282002, + "grad_norm": 1.0172524452209473, + "learning_rate": 1.2720641867250182e-05, + "loss": 0.0219, + "step": 5990 + }, + { + "epoch": 0.3851060245023708, + "grad_norm": 1.4693063497543335, + "learning_rate": 1.2707380147205094e-05, + "loss": 0.023, + "step": 6000 + }, + { + "epoch": 0.3857478678765414, + "grad_norm": 1.2674403190612793, + "learning_rate": 1.2694118427160003e-05, + "loss": 0.0265, + "step": 6010 + }, + { + "epoch": 0.38638971125071203, + "grad_norm": 1.6904219388961792, + "learning_rate": 1.2680856707114914e-05, + "loss": 0.0252, + "step": 6020 + }, + { + "epoch": 0.38703155462488265, + "grad_norm": 5.279049873352051, + "learning_rate": 1.2667594987069824e-05, + "loss": 0.0386, + "step": 6030 + }, + { + "epoch": 0.38767339799905326, + "grad_norm": 0.1736469864845276, + "learning_rate": 1.2654333267024735e-05, + "loss": 0.0279, + "step": 6040 + }, + { + "epoch": 0.3883152413732239, + "grad_norm": 0.937554657459259, + "learning_rate": 1.2641071546979644e-05, + "loss": 0.0373, + "step": 6050 + }, + { + "epoch": 0.38895708474739454, + "grad_norm": 5.91543436050415, + "learning_rate": 1.2627809826934554e-05, + "loss": 0.042, + "step": 6060 + }, + { + "epoch": 0.38959892812156516, + "grad_norm": 2.749023675918579, + "learning_rate": 1.2614548106889463e-05, + "loss": 0.0364, + "step": 6070 + }, + { + "epoch": 0.39024077149573577, + "grad_norm": 2.4227447509765625, + "learning_rate": 1.2601286386844376e-05, + "loss": 0.0218, + "step": 6080 + }, + { + "epoch": 0.3908826148699064, + "grad_norm": 0.1743806004524231, + "learning_rate": 1.2588024666799286e-05, + "loss": 0.0462, + "step": 6090 + }, + { + "epoch": 0.391524458244077, + "grad_norm": 4.252072811126709, + "learning_rate": 1.2574762946754195e-05, + "loss": 0.0476, + "step": 6100 + }, + { + "epoch": 0.3921663016182476, + "grad_norm": 1.6621201038360596, + "learning_rate": 1.2561501226709105e-05, + "loss": 0.0275, + "step": 6110 + }, + { + "epoch": 0.39280814499241823, + "grad_norm": 1.6763057708740234, + "learning_rate": 1.2548239506664016e-05, + "loss": 0.0312, + "step": 6120 + }, + { + "epoch": 0.39344998836658884, + "grad_norm": 1.4106769561767578, + "learning_rate": 1.2534977786618925e-05, + "loss": 0.0168, + "step": 6130 + }, + { + "epoch": 0.39409183174075946, + "grad_norm": 1.142337679862976, + "learning_rate": 1.2521716066573835e-05, + "loss": 0.0296, + "step": 6140 + }, + { + "epoch": 0.39473367511493007, + "grad_norm": 1.2936738729476929, + "learning_rate": 1.2508454346528746e-05, + "loss": 0.0378, + "step": 6150 + }, + { + "epoch": 0.3953755184891007, + "grad_norm": 1.678970456123352, + "learning_rate": 1.2495192626483657e-05, + "loss": 0.0393, + "step": 6160 + }, + { + "epoch": 0.3960173618632713, + "grad_norm": 0.9166494011878967, + "learning_rate": 1.2481930906438567e-05, + "loss": 0.0265, + "step": 6170 + }, + { + "epoch": 0.3966592052374419, + "grad_norm": 0.7071855068206787, + "learning_rate": 1.2468669186393476e-05, + "loss": 0.0177, + "step": 6180 + }, + { + "epoch": 0.39730104861161253, + "grad_norm": 1.9155601263046265, + "learning_rate": 1.2455407466348385e-05, + "loss": 0.0231, + "step": 6190 + }, + { + "epoch": 0.39794289198578314, + "grad_norm": 3.443380832672119, + "learning_rate": 1.2442145746303295e-05, + "loss": 0.0311, + "step": 6200 + }, + { + "epoch": 0.3985847353599538, + "grad_norm": 3.7133545875549316, + "learning_rate": 1.2428884026258208e-05, + "loss": 0.0298, + "step": 6210 + }, + { + "epoch": 0.3992265787341244, + "grad_norm": 0.8525023460388184, + "learning_rate": 1.2415622306213117e-05, + "loss": 0.0372, + "step": 6220 + }, + { + "epoch": 0.39986842210829504, + "grad_norm": 2.423804759979248, + "learning_rate": 1.2402360586168027e-05, + "loss": 0.0219, + "step": 6230 + }, + { + "epoch": 0.40051026548246565, + "grad_norm": 0.8703656792640686, + "learning_rate": 1.2389098866122938e-05, + "loss": 0.0161, + "step": 6240 + }, + { + "epoch": 0.40115210885663627, + "grad_norm": 4.453926086425781, + "learning_rate": 1.2375837146077847e-05, + "loss": 0.0337, + "step": 6250 + }, + { + "epoch": 0.4017939522308069, + "grad_norm": 1.66751229763031, + "learning_rate": 1.2362575426032757e-05, + "loss": 0.0248, + "step": 6260 + }, + { + "epoch": 0.4024357956049775, + "grad_norm": 3.351344108581543, + "learning_rate": 1.2349313705987666e-05, + "loss": 0.0435, + "step": 6270 + }, + { + "epoch": 0.4030776389791481, + "grad_norm": 6.344804286956787, + "learning_rate": 1.233605198594258e-05, + "loss": 0.0325, + "step": 6280 + }, + { + "epoch": 0.4037194823533187, + "grad_norm": 2.798166513442993, + "learning_rate": 1.2322790265897489e-05, + "loss": 0.0413, + "step": 6290 + }, + { + "epoch": 0.40436132572748934, + "grad_norm": 1.518452763557434, + "learning_rate": 1.2309528545852398e-05, + "loss": 0.0467, + "step": 6300 + }, + { + "epoch": 0.40500316910165995, + "grad_norm": 0.12372729927301407, + "learning_rate": 1.2296266825807308e-05, + "loss": 0.0333, + "step": 6310 + }, + { + "epoch": 0.40564501247583057, + "grad_norm": 3.486853837966919, + "learning_rate": 1.2283005105762217e-05, + "loss": 0.0283, + "step": 6320 + }, + { + "epoch": 0.4062868558500012, + "grad_norm": 1.3288218975067139, + "learning_rate": 1.2269743385717128e-05, + "loss": 0.0201, + "step": 6330 + }, + { + "epoch": 0.4069286992241718, + "grad_norm": 1.4961128234863281, + "learning_rate": 1.2256481665672038e-05, + "loss": 0.0276, + "step": 6340 + }, + { + "epoch": 0.40757054259834247, + "grad_norm": 0.3987443447113037, + "learning_rate": 1.2243219945626949e-05, + "loss": 0.0363, + "step": 6350 + }, + { + "epoch": 0.4082123859725131, + "grad_norm": 0.6574511528015137, + "learning_rate": 1.222995822558186e-05, + "loss": 0.0355, + "step": 6360 + }, + { + "epoch": 0.4088542293466837, + "grad_norm": 5.984050750732422, + "learning_rate": 1.221669650553677e-05, + "loss": 0.028, + "step": 6370 + }, + { + "epoch": 0.4094960727208543, + "grad_norm": 0.4951924681663513, + "learning_rate": 1.2203434785491679e-05, + "loss": 0.0343, + "step": 6380 + }, + { + "epoch": 0.4101379160950249, + "grad_norm": 0.14430026710033417, + "learning_rate": 1.2190173065446589e-05, + "loss": 0.0275, + "step": 6390 + }, + { + "epoch": 0.41077975946919554, + "grad_norm": 1.4995652437210083, + "learning_rate": 1.2176911345401498e-05, + "loss": 0.0282, + "step": 6400 + }, + { + "epoch": 0.41142160284336615, + "grad_norm": 0.6720727682113647, + "learning_rate": 1.2163649625356411e-05, + "loss": 0.019, + "step": 6410 + }, + { + "epoch": 0.41206344621753677, + "grad_norm": 3.248645544052124, + "learning_rate": 1.215038790531132e-05, + "loss": 0.0251, + "step": 6420 + }, + { + "epoch": 0.4127052895917074, + "grad_norm": 0.5490819215774536, + "learning_rate": 1.213712618526623e-05, + "loss": 0.0496, + "step": 6430 + }, + { + "epoch": 0.413347132965878, + "grad_norm": 1.7990186214447021, + "learning_rate": 1.212386446522114e-05, + "loss": 0.0496, + "step": 6440 + }, + { + "epoch": 0.4139889763400486, + "grad_norm": 0.8325052261352539, + "learning_rate": 1.211060274517605e-05, + "loss": 0.0334, + "step": 6450 + }, + { + "epoch": 0.4146308197142192, + "grad_norm": 2.4705615043640137, + "learning_rate": 1.209734102513096e-05, + "loss": 0.0259, + "step": 6460 + }, + { + "epoch": 0.41527266308838984, + "grad_norm": 3.4971210956573486, + "learning_rate": 1.208407930508587e-05, + "loss": 0.024, + "step": 6470 + }, + { + "epoch": 0.41591450646256045, + "grad_norm": 0.32653504610061646, + "learning_rate": 1.2070817585040782e-05, + "loss": 0.0478, + "step": 6480 + }, + { + "epoch": 0.41655634983673107, + "grad_norm": 0.8252073526382446, + "learning_rate": 1.2057555864995692e-05, + "loss": 0.0242, + "step": 6490 + }, + { + "epoch": 0.41719819321090174, + "grad_norm": 0.4332980513572693, + "learning_rate": 1.2044294144950601e-05, + "loss": 0.0168, + "step": 6500 + }, + { + "epoch": 0.41784003658507235, + "grad_norm": 1.0085843801498413, + "learning_rate": 1.203103242490551e-05, + "loss": 0.0286, + "step": 6510 + }, + { + "epoch": 0.41848187995924296, + "grad_norm": 0.031135300174355507, + "learning_rate": 1.201777070486042e-05, + "loss": 0.0331, + "step": 6520 + }, + { + "epoch": 0.4191237233334136, + "grad_norm": 1.3585585355758667, + "learning_rate": 1.2004508984815331e-05, + "loss": 0.0313, + "step": 6530 + }, + { + "epoch": 0.4197655667075842, + "grad_norm": 0.3975318968296051, + "learning_rate": 1.1991247264770243e-05, + "loss": 0.0424, + "step": 6540 + }, + { + "epoch": 0.4204074100817548, + "grad_norm": 1.7473927736282349, + "learning_rate": 1.1977985544725152e-05, + "loss": 0.0311, + "step": 6550 + }, + { + "epoch": 0.4210492534559254, + "grad_norm": 0.8991397023200989, + "learning_rate": 1.1964723824680062e-05, + "loss": 0.0262, + "step": 6560 + }, + { + "epoch": 0.42169109683009603, + "grad_norm": 0.20126798748970032, + "learning_rate": 1.1951462104634973e-05, + "loss": 0.0364, + "step": 6570 + }, + { + "epoch": 0.42233294020426665, + "grad_norm": 1.0025629997253418, + "learning_rate": 1.1938200384589882e-05, + "loss": 0.0209, + "step": 6580 + }, + { + "epoch": 0.42297478357843726, + "grad_norm": 1.4650754928588867, + "learning_rate": 1.1924938664544792e-05, + "loss": 0.0331, + "step": 6590 + }, + { + "epoch": 0.4236166269526079, + "grad_norm": 0.48315685987472534, + "learning_rate": 1.1911676944499701e-05, + "loss": 0.0347, + "step": 6600 + }, + { + "epoch": 0.4242584703267785, + "grad_norm": 6.905628681182861, + "learning_rate": 1.1898415224454614e-05, + "loss": 0.03, + "step": 6610 + }, + { + "epoch": 0.4249003137009491, + "grad_norm": 3.8432912826538086, + "learning_rate": 1.1885153504409524e-05, + "loss": 0.0229, + "step": 6620 + }, + { + "epoch": 0.4255421570751197, + "grad_norm": 0.7360489964485168, + "learning_rate": 1.1871891784364433e-05, + "loss": 0.0254, + "step": 6630 + }, + { + "epoch": 0.42618400044929033, + "grad_norm": 0.9641789793968201, + "learning_rate": 1.1858630064319342e-05, + "loss": 0.0311, + "step": 6640 + }, + { + "epoch": 0.426825843823461, + "grad_norm": 0.14324475824832916, + "learning_rate": 1.1845368344274254e-05, + "loss": 0.0276, + "step": 6650 + }, + { + "epoch": 0.4274676871976316, + "grad_norm": 0.7097848653793335, + "learning_rate": 1.1832106624229163e-05, + "loss": 0.0229, + "step": 6660 + }, + { + "epoch": 0.42810953057180223, + "grad_norm": 5.492007255554199, + "learning_rate": 1.1818844904184074e-05, + "loss": 0.0476, + "step": 6670 + }, + { + "epoch": 0.42875137394597285, + "grad_norm": 3.966278076171875, + "learning_rate": 1.1805583184138984e-05, + "loss": 0.0393, + "step": 6680 + }, + { + "epoch": 0.42939321732014346, + "grad_norm": 0.7945524454116821, + "learning_rate": 1.1792321464093895e-05, + "loss": 0.0422, + "step": 6690 + }, + { + "epoch": 0.4300350606943141, + "grad_norm": 0.11064202338457108, + "learning_rate": 1.1779059744048804e-05, + "loss": 0.0339, + "step": 6700 + }, + { + "epoch": 0.4306769040684847, + "grad_norm": 0.8005304336547852, + "learning_rate": 1.1765798024003714e-05, + "loss": 0.0253, + "step": 6710 + }, + { + "epoch": 0.4313187474426553, + "grad_norm": 1.859842300415039, + "learning_rate": 1.1752536303958623e-05, + "loss": 0.0226, + "step": 6720 + }, + { + "epoch": 0.4319605908168259, + "grad_norm": 1.9041091203689575, + "learning_rate": 1.1739274583913533e-05, + "loss": 0.041, + "step": 6730 + }, + { + "epoch": 0.43260243419099653, + "grad_norm": 0.13686195015907288, + "learning_rate": 1.1726012863868446e-05, + "loss": 0.0236, + "step": 6740 + }, + { + "epoch": 0.43324427756516715, + "grad_norm": 0.42238569259643555, + "learning_rate": 1.1712751143823355e-05, + "loss": 0.0488, + "step": 6750 + }, + { + "epoch": 0.43388612093933776, + "grad_norm": 0.4500075578689575, + "learning_rate": 1.1699489423778265e-05, + "loss": 0.0202, + "step": 6760 + }, + { + "epoch": 0.4345279643135084, + "grad_norm": 0.783301055431366, + "learning_rate": 1.1686227703733176e-05, + "loss": 0.0174, + "step": 6770 + }, + { + "epoch": 0.435169807687679, + "grad_norm": 0.05518011748790741, + "learning_rate": 1.1672965983688085e-05, + "loss": 0.0476, + "step": 6780 + }, + { + "epoch": 0.43581165106184966, + "grad_norm": 0.25636357069015503, + "learning_rate": 1.1659704263642995e-05, + "loss": 0.0358, + "step": 6790 + }, + { + "epoch": 0.4364534944360203, + "grad_norm": 0.5332306623458862, + "learning_rate": 1.1646442543597904e-05, + "loss": 0.0298, + "step": 6800 + }, + { + "epoch": 0.4370953378101909, + "grad_norm": 6.301130771636963, + "learning_rate": 1.1633180823552817e-05, + "loss": 0.0344, + "step": 6810 + }, + { + "epoch": 0.4377371811843615, + "grad_norm": 1.3313417434692383, + "learning_rate": 1.1619919103507727e-05, + "loss": 0.026, + "step": 6820 + }, + { + "epoch": 0.4383790245585321, + "grad_norm": 2.344592332839966, + "learning_rate": 1.1606657383462636e-05, + "loss": 0.0378, + "step": 6830 + }, + { + "epoch": 0.43902086793270273, + "grad_norm": 5.8374857902526855, + "learning_rate": 1.1593395663417546e-05, + "loss": 0.0327, + "step": 6840 + }, + { + "epoch": 0.43966271130687334, + "grad_norm": 1.945421814918518, + "learning_rate": 1.1580133943372455e-05, + "loss": 0.0367, + "step": 6850 + }, + { + "epoch": 0.44030455468104396, + "grad_norm": 1.068772315979004, + "learning_rate": 1.1566872223327366e-05, + "loss": 0.0215, + "step": 6860 + }, + { + "epoch": 0.4409463980552146, + "grad_norm": 0.36997637152671814, + "learning_rate": 1.1553610503282277e-05, + "loss": 0.0323, + "step": 6870 + }, + { + "epoch": 0.4415882414293852, + "grad_norm": 0.7297510504722595, + "learning_rate": 1.1540348783237187e-05, + "loss": 0.0177, + "step": 6880 + }, + { + "epoch": 0.4422300848035558, + "grad_norm": 0.4879355728626251, + "learning_rate": 1.1527087063192098e-05, + "loss": 0.0347, + "step": 6890 + }, + { + "epoch": 0.4428719281777264, + "grad_norm": 2.4448482990264893, + "learning_rate": 1.1513825343147008e-05, + "loss": 0.0291, + "step": 6900 + }, + { + "epoch": 0.44351377155189703, + "grad_norm": 0.1498849242925644, + "learning_rate": 1.1500563623101917e-05, + "loss": 0.0154, + "step": 6910 + }, + { + "epoch": 0.44415561492606764, + "grad_norm": 0.27776220440864563, + "learning_rate": 1.1487301903056827e-05, + "loss": 0.0306, + "step": 6920 + }, + { + "epoch": 0.44479745830023826, + "grad_norm": 0.4810338020324707, + "learning_rate": 1.1474040183011736e-05, + "loss": 0.0216, + "step": 6930 + }, + { + "epoch": 0.4454393016744089, + "grad_norm": 0.4320123493671417, + "learning_rate": 1.1460778462966649e-05, + "loss": 0.0418, + "step": 6940 + }, + { + "epoch": 0.44608114504857954, + "grad_norm": 1.5858159065246582, + "learning_rate": 1.1447516742921558e-05, + "loss": 0.0277, + "step": 6950 + }, + { + "epoch": 0.44672298842275016, + "grad_norm": 1.1560159921646118, + "learning_rate": 1.1434255022876468e-05, + "loss": 0.0232, + "step": 6960 + }, + { + "epoch": 0.44736483179692077, + "grad_norm": 0.48158320784568787, + "learning_rate": 1.1420993302831377e-05, + "loss": 0.0308, + "step": 6970 + }, + { + "epoch": 0.4480066751710914, + "grad_norm": 0.22836768627166748, + "learning_rate": 1.1407731582786288e-05, + "loss": 0.0287, + "step": 6980 + }, + { + "epoch": 0.448648518545262, + "grad_norm": 0.2631066143512726, + "learning_rate": 1.1394469862741198e-05, + "loss": 0.0289, + "step": 6990 + }, + { + "epoch": 0.4492903619194326, + "grad_norm": 3.866243362426758, + "learning_rate": 1.1381208142696109e-05, + "loss": 0.0442, + "step": 7000 + }, + { + "epoch": 0.4499322052936032, + "grad_norm": 2.3988699913024902, + "learning_rate": 1.1367946422651019e-05, + "loss": 0.0123, + "step": 7010 + }, + { + "epoch": 0.45057404866777384, + "grad_norm": 3.825685739517212, + "learning_rate": 1.135468470260593e-05, + "loss": 0.0354, + "step": 7020 + }, + { + "epoch": 0.45121589204194446, + "grad_norm": 5.321695804595947, + "learning_rate": 1.134142298256084e-05, + "loss": 0.0204, + "step": 7030 + }, + { + "epoch": 0.45185773541611507, + "grad_norm": 3.1179020404815674, + "learning_rate": 1.1328161262515749e-05, + "loss": 0.0323, + "step": 7040 + }, + { + "epoch": 0.4524995787902857, + "grad_norm": 4.537142276763916, + "learning_rate": 1.1314899542470658e-05, + "loss": 0.027, + "step": 7050 + }, + { + "epoch": 0.4531414221644563, + "grad_norm": 0.16674059629440308, + "learning_rate": 1.1301637822425568e-05, + "loss": 0.0271, + "step": 7060 + }, + { + "epoch": 0.4537832655386269, + "grad_norm": 0.692891001701355, + "learning_rate": 1.128837610238048e-05, + "loss": 0.0207, + "step": 7070 + }, + { + "epoch": 0.4544251089127976, + "grad_norm": 1.5163105726242065, + "learning_rate": 1.127511438233539e-05, + "loss": 0.0339, + "step": 7080 + }, + { + "epoch": 0.4550669522869682, + "grad_norm": 0.9091341495513916, + "learning_rate": 1.12618526622903e-05, + "loss": 0.0243, + "step": 7090 + }, + { + "epoch": 0.4557087956611388, + "grad_norm": 2.6304235458374023, + "learning_rate": 1.124859094224521e-05, + "loss": 0.024, + "step": 7100 + }, + { + "epoch": 0.4563506390353094, + "grad_norm": 5.431898593902588, + "learning_rate": 1.123532922220012e-05, + "loss": 0.0479, + "step": 7110 + }, + { + "epoch": 0.45699248240948004, + "grad_norm": 0.8807118535041809, + "learning_rate": 1.122206750215503e-05, + "loss": 0.0181, + "step": 7120 + }, + { + "epoch": 0.45763432578365065, + "grad_norm": 1.353542447090149, + "learning_rate": 1.120880578210994e-05, + "loss": 0.0156, + "step": 7130 + }, + { + "epoch": 0.45827616915782127, + "grad_norm": 0.13024044036865234, + "learning_rate": 1.1195544062064852e-05, + "loss": 0.0222, + "step": 7140 + }, + { + "epoch": 0.4589180125319919, + "grad_norm": 0.5227543711662292, + "learning_rate": 1.1182282342019761e-05, + "loss": 0.0295, + "step": 7150 + }, + { + "epoch": 0.4595598559061625, + "grad_norm": 4.043930530548096, + "learning_rate": 1.1169020621974671e-05, + "loss": 0.0254, + "step": 7160 + }, + { + "epoch": 0.4602016992803331, + "grad_norm": 1.1538896560668945, + "learning_rate": 1.115575890192958e-05, + "loss": 0.0218, + "step": 7170 + }, + { + "epoch": 0.4608435426545037, + "grad_norm": 1.619653344154358, + "learning_rate": 1.114249718188449e-05, + "loss": 0.0327, + "step": 7180 + }, + { + "epoch": 0.46148538602867434, + "grad_norm": 2.511518955230713, + "learning_rate": 1.1129235461839401e-05, + "loss": 0.0326, + "step": 7190 + }, + { + "epoch": 0.46212722940284495, + "grad_norm": 0.5349030494689941, + "learning_rate": 1.1115973741794312e-05, + "loss": 0.0213, + "step": 7200 + }, + { + "epoch": 0.46276907277701557, + "grad_norm": 1.1988285779953003, + "learning_rate": 1.1102712021749222e-05, + "loss": 0.0281, + "step": 7210 + }, + { + "epoch": 0.4634109161511862, + "grad_norm": 0.8555197715759277, + "learning_rate": 1.1089450301704133e-05, + "loss": 0.0349, + "step": 7220 + }, + { + "epoch": 0.46405275952535685, + "grad_norm": 1.372574806213379, + "learning_rate": 1.1076188581659042e-05, + "loss": 0.0395, + "step": 7230 + }, + { + "epoch": 0.46469460289952746, + "grad_norm": 0.7914135456085205, + "learning_rate": 1.1062926861613952e-05, + "loss": 0.0234, + "step": 7240 + }, + { + "epoch": 0.4653364462736981, + "grad_norm": 0.6850894093513489, + "learning_rate": 1.1049665141568861e-05, + "loss": 0.0229, + "step": 7250 + }, + { + "epoch": 0.4659782896478687, + "grad_norm": 0.6751859188079834, + "learning_rate": 1.1036403421523774e-05, + "loss": 0.0317, + "step": 7260 + }, + { + "epoch": 0.4666201330220393, + "grad_norm": 0.9388216137886047, + "learning_rate": 1.1023141701478684e-05, + "loss": 0.0339, + "step": 7270 + }, + { + "epoch": 0.4672619763962099, + "grad_norm": 0.6718018651008606, + "learning_rate": 1.1009879981433593e-05, + "loss": 0.0233, + "step": 7280 + }, + { + "epoch": 0.46790381977038054, + "grad_norm": 0.6536454558372498, + "learning_rate": 1.0996618261388503e-05, + "loss": 0.028, + "step": 7290 + }, + { + "epoch": 0.46854566314455115, + "grad_norm": 0.9818311333656311, + "learning_rate": 1.0983356541343412e-05, + "loss": 0.0244, + "step": 7300 + }, + { + "epoch": 0.46918750651872176, + "grad_norm": 4.402950286865234, + "learning_rate": 1.0970094821298323e-05, + "loss": 0.0205, + "step": 7310 + }, + { + "epoch": 0.4698293498928924, + "grad_norm": 1.2141329050064087, + "learning_rate": 1.0956833101253233e-05, + "loss": 0.0237, + "step": 7320 + }, + { + "epoch": 0.470471193267063, + "grad_norm": 2.2326622009277344, + "learning_rate": 1.0943571381208144e-05, + "loss": 0.0254, + "step": 7330 + }, + { + "epoch": 0.4711130366412336, + "grad_norm": 6.558351039886475, + "learning_rate": 1.0930309661163055e-05, + "loss": 0.0466, + "step": 7340 + }, + { + "epoch": 0.4717548800154042, + "grad_norm": 0.7890551686286926, + "learning_rate": 1.0917047941117965e-05, + "loss": 0.036, + "step": 7350 + }, + { + "epoch": 0.47239672338957484, + "grad_norm": 5.157413005828857, + "learning_rate": 1.0903786221072874e-05, + "loss": 0.012, + "step": 7360 + }, + { + "epoch": 0.47303856676374545, + "grad_norm": 0.047313038259744644, + "learning_rate": 1.0890524501027784e-05, + "loss": 0.0325, + "step": 7370 + }, + { + "epoch": 0.4736804101379161, + "grad_norm": 1.082221269607544, + "learning_rate": 1.0877262780982693e-05, + "loss": 0.0236, + "step": 7380 + }, + { + "epoch": 0.47432225351208673, + "grad_norm": 1.6152721643447876, + "learning_rate": 1.0864001060937604e-05, + "loss": 0.0393, + "step": 7390 + }, + { + "epoch": 0.47496409688625735, + "grad_norm": 3.0274405479431152, + "learning_rate": 1.0850739340892515e-05, + "loss": 0.0323, + "step": 7400 + }, + { + "epoch": 0.47560594026042796, + "grad_norm": 0.7774971127510071, + "learning_rate": 1.0837477620847425e-05, + "loss": 0.0381, + "step": 7410 + }, + { + "epoch": 0.4762477836345986, + "grad_norm": 0.2556317448616028, + "learning_rate": 1.0824215900802334e-05, + "loss": 0.0545, + "step": 7420 + }, + { + "epoch": 0.4768896270087692, + "grad_norm": 1.066426396369934, + "learning_rate": 1.0810954180757245e-05, + "loss": 0.025, + "step": 7430 + }, + { + "epoch": 0.4775314703829398, + "grad_norm": 3.27673602104187, + "learning_rate": 1.0797692460712155e-05, + "loss": 0.0275, + "step": 7440 + }, + { + "epoch": 0.4781733137571104, + "grad_norm": 2.6178624629974365, + "learning_rate": 1.0784430740667064e-05, + "loss": 0.02, + "step": 7450 + }, + { + "epoch": 0.47881515713128103, + "grad_norm": 0.1021733283996582, + "learning_rate": 1.0771169020621977e-05, + "loss": 0.0222, + "step": 7460 + }, + { + "epoch": 0.47945700050545165, + "grad_norm": 2.5011208057403564, + "learning_rate": 1.0757907300576887e-05, + "loss": 0.0293, + "step": 7470 + }, + { + "epoch": 0.48009884387962226, + "grad_norm": 0.8056000471115112, + "learning_rate": 1.0744645580531796e-05, + "loss": 0.013, + "step": 7480 + }, + { + "epoch": 0.4807406872537929, + "grad_norm": 1.466461420059204, + "learning_rate": 1.0731383860486706e-05, + "loss": 0.0137, + "step": 7490 + }, + { + "epoch": 0.4813825306279635, + "grad_norm": 1.2518103122711182, + "learning_rate": 1.0718122140441615e-05, + "loss": 0.0241, + "step": 7500 + }, + { + "epoch": 0.4820243740021341, + "grad_norm": 0.7515360713005066, + "learning_rate": 1.0704860420396526e-05, + "loss": 0.0181, + "step": 7510 + }, + { + "epoch": 0.4826662173763048, + "grad_norm": 1.9555808305740356, + "learning_rate": 1.0691598700351436e-05, + "loss": 0.0119, + "step": 7520 + }, + { + "epoch": 0.4833080607504754, + "grad_norm": 2.6431872844696045, + "learning_rate": 1.0678336980306347e-05, + "loss": 0.0209, + "step": 7530 + }, + { + "epoch": 0.483949904124646, + "grad_norm": 4.683318614959717, + "learning_rate": 1.0665075260261257e-05, + "loss": 0.0312, + "step": 7540 + }, + { + "epoch": 0.4845917474988166, + "grad_norm": 0.357537180185318, + "learning_rate": 1.0651813540216168e-05, + "loss": 0.0335, + "step": 7550 + }, + { + "epoch": 0.48523359087298723, + "grad_norm": 0.3757706880569458, + "learning_rate": 1.0638551820171077e-05, + "loss": 0.0294, + "step": 7560 + }, + { + "epoch": 0.48587543424715784, + "grad_norm": 2.468076467514038, + "learning_rate": 1.0625290100125987e-05, + "loss": 0.0131, + "step": 7570 + }, + { + "epoch": 0.48651727762132846, + "grad_norm": 0.47705045342445374, + "learning_rate": 1.0612028380080896e-05, + "loss": 0.0195, + "step": 7580 + }, + { + "epoch": 0.4871591209954991, + "grad_norm": 0.9728513360023499, + "learning_rate": 1.0598766660035809e-05, + "loss": 0.0446, + "step": 7590 + }, + { + "epoch": 0.4878009643696697, + "grad_norm": 1.766430139541626, + "learning_rate": 1.0585504939990718e-05, + "loss": 0.0255, + "step": 7600 + }, + { + "epoch": 0.4884428077438403, + "grad_norm": 0.12747463583946228, + "learning_rate": 1.0572243219945628e-05, + "loss": 0.0222, + "step": 7610 + }, + { + "epoch": 0.4890846511180109, + "grad_norm": 0.43220123648643494, + "learning_rate": 1.0558981499900537e-05, + "loss": 0.0291, + "step": 7620 + }, + { + "epoch": 0.48972649449218153, + "grad_norm": 0.8481399416923523, + "learning_rate": 1.0545719779855449e-05, + "loss": 0.0131, + "step": 7630 + }, + { + "epoch": 0.49036833786635214, + "grad_norm": 0.293948769569397, + "learning_rate": 1.0532458059810358e-05, + "loss": 0.042, + "step": 7640 + }, + { + "epoch": 0.49101018124052276, + "grad_norm": 0.6220739483833313, + "learning_rate": 1.0519196339765268e-05, + "loss": 0.0227, + "step": 7650 + }, + { + "epoch": 0.4916520246146934, + "grad_norm": 0.4679778814315796, + "learning_rate": 1.0505934619720179e-05, + "loss": 0.0212, + "step": 7660 + }, + { + "epoch": 0.49229386798886404, + "grad_norm": 1.4122891426086426, + "learning_rate": 1.049267289967509e-05, + "loss": 0.0365, + "step": 7670 + }, + { + "epoch": 0.49293571136303466, + "grad_norm": 0.41941073536872864, + "learning_rate": 1.047941117963e-05, + "loss": 0.0417, + "step": 7680 + }, + { + "epoch": 0.49357755473720527, + "grad_norm": 0.9413459300994873, + "learning_rate": 1.0466149459584909e-05, + "loss": 0.0349, + "step": 7690 + }, + { + "epoch": 0.4942193981113759, + "grad_norm": 0.16081929206848145, + "learning_rate": 1.0452887739539818e-05, + "loss": 0.0355, + "step": 7700 + }, + { + "epoch": 0.4948612414855465, + "grad_norm": 0.12057320773601532, + "learning_rate": 1.0439626019494728e-05, + "loss": 0.0237, + "step": 7710 + }, + { + "epoch": 0.4955030848597171, + "grad_norm": 0.39029595255851746, + "learning_rate": 1.042636429944964e-05, + "loss": 0.0242, + "step": 7720 + }, + { + "epoch": 0.4961449282338877, + "grad_norm": 1.100891351699829, + "learning_rate": 1.041310257940455e-05, + "loss": 0.0207, + "step": 7730 + }, + { + "epoch": 0.49678677160805834, + "grad_norm": 0.3278030455112457, + "learning_rate": 1.039984085935946e-05, + "loss": 0.0255, + "step": 7740 + }, + { + "epoch": 0.49742861498222896, + "grad_norm": 2.3268234729766846, + "learning_rate": 1.038657913931437e-05, + "loss": 0.0242, + "step": 7750 + }, + { + "epoch": 0.49807045835639957, + "grad_norm": 4.574901580810547, + "learning_rate": 1.037331741926928e-05, + "loss": 0.0303, + "step": 7760 + }, + { + "epoch": 0.4987123017305702, + "grad_norm": 0.822098433971405, + "learning_rate": 1.036005569922419e-05, + "loss": 0.0191, + "step": 7770 + }, + { + "epoch": 0.4993541451047408, + "grad_norm": 0.3044750988483429, + "learning_rate": 1.03467939791791e-05, + "loss": 0.0235, + "step": 7780 + }, + { + "epoch": 0.4999959884789114, + "grad_norm": 0.9926797747612, + "learning_rate": 1.0333532259134012e-05, + "loss": 0.0132, + "step": 7790 + }, + { + "epoch": 0.5006378318530821, + "grad_norm": 1.0424890518188477, + "learning_rate": 1.0320270539088922e-05, + "loss": 0.0331, + "step": 7800 + }, + { + "epoch": 0.5012796752272527, + "grad_norm": 0.3462810516357422, + "learning_rate": 1.0307008819043831e-05, + "loss": 0.0198, + "step": 7810 + }, + { + "epoch": 0.5019215186014233, + "grad_norm": 0.24118022620677948, + "learning_rate": 1.029374709899874e-05, + "loss": 0.0217, + "step": 7820 + }, + { + "epoch": 0.5025633619755939, + "grad_norm": 0.46797555685043335, + "learning_rate": 1.028048537895365e-05, + "loss": 0.0366, + "step": 7830 + }, + { + "epoch": 0.5032052053497645, + "grad_norm": 0.6193893551826477, + "learning_rate": 1.0267223658908561e-05, + "loss": 0.0103, + "step": 7840 + }, + { + "epoch": 0.5038470487239352, + "grad_norm": 0.8424329161643982, + "learning_rate": 1.025396193886347e-05, + "loss": 0.0321, + "step": 7850 + }, + { + "epoch": 0.5044888920981058, + "grad_norm": 0.15049193799495697, + "learning_rate": 1.0240700218818382e-05, + "loss": 0.0183, + "step": 7860 + }, + { + "epoch": 0.5051307354722764, + "grad_norm": 5.053986072540283, + "learning_rate": 1.0227438498773293e-05, + "loss": 0.0242, + "step": 7870 + }, + { + "epoch": 0.505772578846447, + "grad_norm": 2.5194997787475586, + "learning_rate": 1.0214176778728202e-05, + "loss": 0.0182, + "step": 7880 + }, + { + "epoch": 0.5064144222206176, + "grad_norm": 0.05712272971868515, + "learning_rate": 1.0200915058683112e-05, + "loss": 0.0207, + "step": 7890 + }, + { + "epoch": 0.5070562655947882, + "grad_norm": 0.5997760891914368, + "learning_rate": 1.0187653338638021e-05, + "loss": 0.0385, + "step": 7900 + }, + { + "epoch": 0.5076981089689588, + "grad_norm": 0.05409073084592819, + "learning_rate": 1.0174391618592931e-05, + "loss": 0.0293, + "step": 7910 + }, + { + "epoch": 0.5083399523431295, + "grad_norm": 0.35536444187164307, + "learning_rate": 1.0161129898547844e-05, + "loss": 0.0201, + "step": 7920 + }, + { + "epoch": 0.5089817957173001, + "grad_norm": 0.0880308747291565, + "learning_rate": 1.0147868178502753e-05, + "loss": 0.0137, + "step": 7930 + }, + { + "epoch": 0.5096236390914707, + "grad_norm": 0.22606980800628662, + "learning_rate": 1.0134606458457663e-05, + "loss": 0.0244, + "step": 7940 + }, + { + "epoch": 0.5102654824656413, + "grad_norm": 0.05226253718137741, + "learning_rate": 1.0121344738412572e-05, + "loss": 0.011, + "step": 7950 + }, + { + "epoch": 0.5109073258398119, + "grad_norm": 0.9334781765937805, + "learning_rate": 1.0108083018367483e-05, + "loss": 0.0314, + "step": 7960 + }, + { + "epoch": 0.5115491692139825, + "grad_norm": 0.446389764547348, + "learning_rate": 1.0094821298322393e-05, + "loss": 0.0238, + "step": 7970 + }, + { + "epoch": 0.5121910125881531, + "grad_norm": 0.25008928775787354, + "learning_rate": 1.0081559578277302e-05, + "loss": 0.0191, + "step": 7980 + }, + { + "epoch": 0.5128328559623238, + "grad_norm": 1.671425461769104, + "learning_rate": 1.0068297858232215e-05, + "loss": 0.0342, + "step": 7990 + }, + { + "epoch": 0.5134746993364944, + "grad_norm": 0.8550741076469421, + "learning_rate": 1.0055036138187125e-05, + "loss": 0.0215, + "step": 8000 + }, + { + "epoch": 0.514116542710665, + "grad_norm": 0.5535662174224854, + "learning_rate": 1.0041774418142034e-05, + "loss": 0.0402, + "step": 8010 + }, + { + "epoch": 0.5147583860848356, + "grad_norm": 0.6770220398902893, + "learning_rate": 1.0028512698096944e-05, + "loss": 0.0194, + "step": 8020 + }, + { + "epoch": 0.5154002294590063, + "grad_norm": 1.0631548166275024, + "learning_rate": 1.0015250978051853e-05, + "loss": 0.0329, + "step": 8030 + }, + { + "epoch": 0.5160420728331769, + "grad_norm": 0.944418728351593, + "learning_rate": 1.0001989258006763e-05, + "loss": 0.0195, + "step": 8040 + }, + { + "epoch": 0.5166839162073475, + "grad_norm": 0.7196710109710693, + "learning_rate": 9.988727537961674e-06, + "loss": 0.0229, + "step": 8050 + }, + { + "epoch": 0.5173257595815182, + "grad_norm": 0.5939407348632812, + "learning_rate": 9.975465817916585e-06, + "loss": 0.0214, + "step": 8060 + }, + { + "epoch": 0.5179676029556888, + "grad_norm": 8.581120491027832, + "learning_rate": 9.962204097871494e-06, + "loss": 0.0299, + "step": 8070 + }, + { + "epoch": 0.5186094463298594, + "grad_norm": 1.4366955757141113, + "learning_rate": 9.948942377826406e-06, + "loss": 0.0222, + "step": 8080 + }, + { + "epoch": 0.51925128970403, + "grad_norm": 0.4379299581050873, + "learning_rate": 9.935680657781315e-06, + "loss": 0.0278, + "step": 8090 + }, + { + "epoch": 0.5198931330782006, + "grad_norm": 0.5662280917167664, + "learning_rate": 9.922418937736226e-06, + "loss": 0.0226, + "step": 8100 + }, + { + "epoch": 0.5205349764523712, + "grad_norm": 0.5420593619346619, + "learning_rate": 9.909157217691136e-06, + "loss": 0.0208, + "step": 8110 + }, + { + "epoch": 0.5211768198265418, + "grad_norm": 5.630989074707031, + "learning_rate": 9.895895497646045e-06, + "loss": 0.0375, + "step": 8120 + }, + { + "epoch": 0.5218186632007125, + "grad_norm": 4.6729230880737305, + "learning_rate": 9.882633777600955e-06, + "loss": 0.0321, + "step": 8130 + }, + { + "epoch": 0.5224605065748831, + "grad_norm": 1.7400848865509033, + "learning_rate": 9.869372057555866e-06, + "loss": 0.0141, + "step": 8140 + }, + { + "epoch": 0.5231023499490537, + "grad_norm": 1.0144647359848022, + "learning_rate": 9.856110337510775e-06, + "loss": 0.0378, + "step": 8150 + }, + { + "epoch": 0.5237441933232243, + "grad_norm": 0.250291645526886, + "learning_rate": 9.842848617465685e-06, + "loss": 0.0197, + "step": 8160 + }, + { + "epoch": 0.5243860366973949, + "grad_norm": 0.33458101749420166, + "learning_rate": 9.829586897420596e-06, + "loss": 0.0201, + "step": 8170 + }, + { + "epoch": 0.5250278800715655, + "grad_norm": 1.0443096160888672, + "learning_rate": 9.816325177375507e-06, + "loss": 0.0222, + "step": 8180 + }, + { + "epoch": 0.5256697234457361, + "grad_norm": 0.8161584734916687, + "learning_rate": 9.803063457330417e-06, + "loss": 0.016, + "step": 8190 + }, + { + "epoch": 0.5263115668199068, + "grad_norm": 0.007811207789927721, + "learning_rate": 9.789801737285328e-06, + "loss": 0.0231, + "step": 8200 + }, + { + "epoch": 0.5269534101940774, + "grad_norm": 1.5142780542373657, + "learning_rate": 9.776540017240237e-06, + "loss": 0.0305, + "step": 8210 + }, + { + "epoch": 0.527595253568248, + "grad_norm": 0.3009796738624573, + "learning_rate": 9.763278297195147e-06, + "loss": 0.018, + "step": 8220 + }, + { + "epoch": 0.5282370969424186, + "grad_norm": 0.023198135197162628, + "learning_rate": 9.750016577150058e-06, + "loss": 0.0345, + "step": 8230 + }, + { + "epoch": 0.5288789403165892, + "grad_norm": 1.20603609085083, + "learning_rate": 9.736754857104967e-06, + "loss": 0.0293, + "step": 8240 + }, + { + "epoch": 0.5295207836907598, + "grad_norm": 1.3458225727081299, + "learning_rate": 9.723493137059877e-06, + "loss": 0.0284, + "step": 8250 + }, + { + "epoch": 0.5301626270649304, + "grad_norm": 1.0462092161178589, + "learning_rate": 9.710231417014786e-06, + "loss": 0.0246, + "step": 8260 + }, + { + "epoch": 0.5308044704391011, + "grad_norm": 0.3493569493293762, + "learning_rate": 9.696969696969698e-06, + "loss": 0.0296, + "step": 8270 + }, + { + "epoch": 0.5314463138132717, + "grad_norm": 0.17700675129890442, + "learning_rate": 9.683707976924607e-06, + "loss": 0.0327, + "step": 8280 + }, + { + "epoch": 0.5320881571874423, + "grad_norm": 0.3968372344970703, + "learning_rate": 9.670446256879518e-06, + "loss": 0.0393, + "step": 8290 + }, + { + "epoch": 0.5327300005616129, + "grad_norm": 0.5520299077033997, + "learning_rate": 9.65718453683443e-06, + "loss": 0.0219, + "step": 8300 + }, + { + "epoch": 0.5333718439357835, + "grad_norm": 3.568875312805176, + "learning_rate": 9.643922816789339e-06, + "loss": 0.0194, + "step": 8310 + }, + { + "epoch": 0.5340136873099542, + "grad_norm": 2.2202658653259277, + "learning_rate": 9.630661096744248e-06, + "loss": 0.0308, + "step": 8320 + }, + { + "epoch": 0.5346555306841249, + "grad_norm": 0.727830708026886, + "learning_rate": 9.61739937669916e-06, + "loss": 0.0281, + "step": 8330 + }, + { + "epoch": 0.5352973740582955, + "grad_norm": 1.5965157747268677, + "learning_rate": 9.604137656654069e-06, + "loss": 0.0304, + "step": 8340 + }, + { + "epoch": 0.5359392174324661, + "grad_norm": 0.17011073231697083, + "learning_rate": 9.590875936608978e-06, + "loss": 0.0461, + "step": 8350 + }, + { + "epoch": 0.5365810608066367, + "grad_norm": 0.8461958169937134, + "learning_rate": 9.577614216563888e-06, + "loss": 0.0254, + "step": 8360 + }, + { + "epoch": 0.5372229041808073, + "grad_norm": 4.257885456085205, + "learning_rate": 9.564352496518799e-06, + "loss": 0.0286, + "step": 8370 + }, + { + "epoch": 0.5378647475549779, + "grad_norm": 0.5877963900566101, + "learning_rate": 9.551090776473709e-06, + "loss": 0.0202, + "step": 8380 + }, + { + "epoch": 0.5385065909291485, + "grad_norm": 0.26890161633491516, + "learning_rate": 9.53782905642862e-06, + "loss": 0.025, + "step": 8390 + }, + { + "epoch": 0.5391484343033192, + "grad_norm": 0.25851166248321533, + "learning_rate": 9.52456733638353e-06, + "loss": 0.0319, + "step": 8400 + }, + { + "epoch": 0.5397902776774898, + "grad_norm": 0.757095992565155, + "learning_rate": 9.51130561633844e-06, + "loss": 0.0299, + "step": 8410 + }, + { + "epoch": 0.5404321210516604, + "grad_norm": 0.4859978258609772, + "learning_rate": 9.49804389629335e-06, + "loss": 0.0251, + "step": 8420 + }, + { + "epoch": 0.541073964425831, + "grad_norm": 1.4098316431045532, + "learning_rate": 9.484782176248261e-06, + "loss": 0.0293, + "step": 8430 + }, + { + "epoch": 0.5417158078000016, + "grad_norm": 1.0507205724716187, + "learning_rate": 9.47152045620317e-06, + "loss": 0.0382, + "step": 8440 + }, + { + "epoch": 0.5423576511741722, + "grad_norm": 1.2821528911590576, + "learning_rate": 9.45825873615808e-06, + "loss": 0.0254, + "step": 8450 + }, + { + "epoch": 0.5429994945483428, + "grad_norm": 3.5782406330108643, + "learning_rate": 9.444997016112991e-06, + "loss": 0.0311, + "step": 8460 + }, + { + "epoch": 0.5436413379225135, + "grad_norm": 1.255115032196045, + "learning_rate": 9.4317352960679e-06, + "loss": 0.0283, + "step": 8470 + }, + { + "epoch": 0.5442831812966841, + "grad_norm": 2.0047855377197266, + "learning_rate": 9.41847357602281e-06, + "loss": 0.0286, + "step": 8480 + }, + { + "epoch": 0.5449250246708547, + "grad_norm": 3.766130208969116, + "learning_rate": 9.405211855977721e-06, + "loss": 0.0249, + "step": 8490 + }, + { + "epoch": 0.5455668680450253, + "grad_norm": 3.5647730827331543, + "learning_rate": 9.39195013593263e-06, + "loss": 0.0328, + "step": 8500 + }, + { + "epoch": 0.5462087114191959, + "grad_norm": 3.9039595127105713, + "learning_rate": 9.378688415887542e-06, + "loss": 0.0239, + "step": 8510 + }, + { + "epoch": 0.5468505547933665, + "grad_norm": 1.170214056968689, + "learning_rate": 9.365426695842451e-06, + "loss": 0.0272, + "step": 8520 + }, + { + "epoch": 0.5474923981675371, + "grad_norm": 0.36973121762275696, + "learning_rate": 9.352164975797363e-06, + "loss": 0.0262, + "step": 8530 + }, + { + "epoch": 0.5481342415417078, + "grad_norm": 0.8599039912223816, + "learning_rate": 9.338903255752272e-06, + "loss": 0.0246, + "step": 8540 + }, + { + "epoch": 0.5487760849158784, + "grad_norm": 0.2630428671836853, + "learning_rate": 9.325641535707182e-06, + "loss": 0.0254, + "step": 8550 + }, + { + "epoch": 0.549417928290049, + "grad_norm": 0.9511770009994507, + "learning_rate": 9.312379815662093e-06, + "loss": 0.0255, + "step": 8560 + }, + { + "epoch": 0.5500597716642196, + "grad_norm": 0.2588956654071808, + "learning_rate": 9.299118095617002e-06, + "loss": 0.0307, + "step": 8570 + }, + { + "epoch": 0.5507016150383902, + "grad_norm": 0.17903345823287964, + "learning_rate": 9.285856375571912e-06, + "loss": 0.0152, + "step": 8580 + }, + { + "epoch": 0.5513434584125608, + "grad_norm": 2.2372515201568604, + "learning_rate": 9.272594655526823e-06, + "loss": 0.0291, + "step": 8590 + }, + { + "epoch": 0.5519853017867314, + "grad_norm": 0.02844194695353508, + "learning_rate": 9.259332935481732e-06, + "loss": 0.017, + "step": 8600 + }, + { + "epoch": 0.5526271451609021, + "grad_norm": 0.294089138507843, + "learning_rate": 9.246071215436644e-06, + "loss": 0.0166, + "step": 8610 + }, + { + "epoch": 0.5532689885350728, + "grad_norm": 0.9256640672683716, + "learning_rate": 9.232809495391553e-06, + "loss": 0.0302, + "step": 8620 + }, + { + "epoch": 0.5539108319092434, + "grad_norm": 0.5491749048233032, + "learning_rate": 9.219547775346464e-06, + "loss": 0.012, + "step": 8630 + }, + { + "epoch": 0.554552675283414, + "grad_norm": 3.515266180038452, + "learning_rate": 9.206286055301374e-06, + "loss": 0.0239, + "step": 8640 + }, + { + "epoch": 0.5551945186575846, + "grad_norm": 3.337519407272339, + "learning_rate": 9.193024335256283e-06, + "loss": 0.0384, + "step": 8650 + }, + { + "epoch": 0.5558363620317552, + "grad_norm": 0.6898011565208435, + "learning_rate": 9.179762615211194e-06, + "loss": 0.0196, + "step": 8660 + }, + { + "epoch": 0.5564782054059259, + "grad_norm": 1.0568803548812866, + "learning_rate": 9.166500895166104e-06, + "loss": 0.0287, + "step": 8670 + }, + { + "epoch": 0.5571200487800965, + "grad_norm": 0.8259192109107971, + "learning_rate": 9.153239175121013e-06, + "loss": 0.0126, + "step": 8680 + }, + { + "epoch": 0.5577618921542671, + "grad_norm": 0.4048657715320587, + "learning_rate": 9.139977455075924e-06, + "loss": 0.0292, + "step": 8690 + }, + { + "epoch": 0.5584037355284377, + "grad_norm": 0.6915555596351624, + "learning_rate": 9.126715735030834e-06, + "loss": 0.0184, + "step": 8700 + }, + { + "epoch": 0.5590455789026083, + "grad_norm": 0.19845692813396454, + "learning_rate": 9.113454014985743e-06, + "loss": 0.0232, + "step": 8710 + }, + { + "epoch": 0.5596874222767789, + "grad_norm": 0.4381246864795685, + "learning_rate": 9.100192294940655e-06, + "loss": 0.0441, + "step": 8720 + }, + { + "epoch": 0.5603292656509495, + "grad_norm": 1.0122863054275513, + "learning_rate": 9.086930574895566e-06, + "loss": 0.0272, + "step": 8730 + }, + { + "epoch": 0.5609711090251202, + "grad_norm": 0.18714962899684906, + "learning_rate": 9.073668854850475e-06, + "loss": 0.017, + "step": 8740 + }, + { + "epoch": 0.5616129523992908, + "grad_norm": 0.34267953038215637, + "learning_rate": 9.060407134805385e-06, + "loss": 0.0257, + "step": 8750 + }, + { + "epoch": 0.5622547957734614, + "grad_norm": 0.4609805643558502, + "learning_rate": 9.047145414760296e-06, + "loss": 0.0256, + "step": 8760 + }, + { + "epoch": 0.562896639147632, + "grad_norm": 0.06432998180389404, + "learning_rate": 9.033883694715205e-06, + "loss": 0.0141, + "step": 8770 + }, + { + "epoch": 0.5635384825218026, + "grad_norm": 0.09321536868810654, + "learning_rate": 9.020621974670115e-06, + "loss": 0.0212, + "step": 8780 + }, + { + "epoch": 0.5641803258959732, + "grad_norm": 7.035615921020508, + "learning_rate": 9.007360254625026e-06, + "loss": 0.0337, + "step": 8790 + }, + { + "epoch": 0.5648221692701438, + "grad_norm": 0.7401273250579834, + "learning_rate": 8.994098534579935e-06, + "loss": 0.0252, + "step": 8800 + }, + { + "epoch": 0.5654640126443145, + "grad_norm": 4.558418273925781, + "learning_rate": 8.980836814534845e-06, + "loss": 0.0336, + "step": 8810 + }, + { + "epoch": 0.5661058560184851, + "grad_norm": 0.634662389755249, + "learning_rate": 8.967575094489756e-06, + "loss": 0.0163, + "step": 8820 + }, + { + "epoch": 0.5667476993926557, + "grad_norm": 0.8215422630310059, + "learning_rate": 8.954313374444666e-06, + "loss": 0.0277, + "step": 8830 + }, + { + "epoch": 0.5673895427668263, + "grad_norm": 0.9724500179290771, + "learning_rate": 8.941051654399577e-06, + "loss": 0.02, + "step": 8840 + }, + { + "epoch": 0.5680313861409969, + "grad_norm": 5.943985939025879, + "learning_rate": 8.927789934354486e-06, + "loss": 0.0225, + "step": 8850 + }, + { + "epoch": 0.5686732295151675, + "grad_norm": 2.096219301223755, + "learning_rate": 8.914528214309397e-06, + "loss": 0.0417, + "step": 8860 + }, + { + "epoch": 0.5693150728893381, + "grad_norm": 4.453114986419678, + "learning_rate": 8.901266494264307e-06, + "loss": 0.0248, + "step": 8870 + }, + { + "epoch": 0.5699569162635088, + "grad_norm": 4.170821189880371, + "learning_rate": 8.888004774219216e-06, + "loss": 0.0231, + "step": 8880 + }, + { + "epoch": 0.5705987596376794, + "grad_norm": 1.1110122203826904, + "learning_rate": 8.874743054174128e-06, + "loss": 0.0282, + "step": 8890 + }, + { + "epoch": 0.57124060301185, + "grad_norm": 2.6466012001037598, + "learning_rate": 8.861481334129037e-06, + "loss": 0.0221, + "step": 8900 + }, + { + "epoch": 0.5718824463860207, + "grad_norm": 0.25523126125335693, + "learning_rate": 8.848219614083947e-06, + "loss": 0.0127, + "step": 8910 + }, + { + "epoch": 0.5725242897601913, + "grad_norm": 3.594900608062744, + "learning_rate": 8.834957894038858e-06, + "loss": 0.0209, + "step": 8920 + }, + { + "epoch": 0.5731661331343619, + "grad_norm": 0.8159608840942383, + "learning_rate": 8.821696173993767e-06, + "loss": 0.018, + "step": 8930 + }, + { + "epoch": 0.5738079765085325, + "grad_norm": 0.9161319732666016, + "learning_rate": 8.808434453948678e-06, + "loss": 0.0368, + "step": 8940 + }, + { + "epoch": 0.5744498198827032, + "grad_norm": 0.16555462777614594, + "learning_rate": 8.795172733903588e-06, + "loss": 0.0199, + "step": 8950 + }, + { + "epoch": 0.5750916632568738, + "grad_norm": 3.5098259449005127, + "learning_rate": 8.781911013858499e-06, + "loss": 0.0296, + "step": 8960 + }, + { + "epoch": 0.5757335066310444, + "grad_norm": 0.0742286741733551, + "learning_rate": 8.768649293813408e-06, + "loss": 0.038, + "step": 8970 + }, + { + "epoch": 0.576375350005215, + "grad_norm": 1.3032357692718506, + "learning_rate": 8.755387573768318e-06, + "loss": 0.0373, + "step": 8980 + }, + { + "epoch": 0.5770171933793856, + "grad_norm": 0.9205700755119324, + "learning_rate": 8.742125853723229e-06, + "loss": 0.0328, + "step": 8990 + }, + { + "epoch": 0.5776590367535562, + "grad_norm": 1.2080702781677246, + "learning_rate": 8.728864133678139e-06, + "loss": 0.0313, + "step": 9000 + }, + { + "epoch": 0.5783008801277268, + "grad_norm": 0.5622536540031433, + "learning_rate": 8.715602413633048e-06, + "loss": 0.0206, + "step": 9010 + }, + { + "epoch": 0.5789427235018975, + "grad_norm": 0.46471625566482544, + "learning_rate": 8.70234069358796e-06, + "loss": 0.0324, + "step": 9020 + }, + { + "epoch": 0.5795845668760681, + "grad_norm": 0.054508015513420105, + "learning_rate": 8.689078973542869e-06, + "loss": 0.0242, + "step": 9030 + }, + { + "epoch": 0.5802264102502387, + "grad_norm": 8.564746856689453, + "learning_rate": 8.67581725349778e-06, + "loss": 0.0328, + "step": 9040 + }, + { + "epoch": 0.5808682536244093, + "grad_norm": 0.13193635642528534, + "learning_rate": 8.66255553345269e-06, + "loss": 0.0239, + "step": 9050 + }, + { + "epoch": 0.5815100969985799, + "grad_norm": 0.4824690520763397, + "learning_rate": 8.6492938134076e-06, + "loss": 0.0225, + "step": 9060 + }, + { + "epoch": 0.5821519403727505, + "grad_norm": 0.7100673913955688, + "learning_rate": 8.63603209336251e-06, + "loss": 0.0334, + "step": 9070 + }, + { + "epoch": 0.5827937837469211, + "grad_norm": 0.1487302929162979, + "learning_rate": 8.62277037331742e-06, + "loss": 0.0325, + "step": 9080 + }, + { + "epoch": 0.5834356271210918, + "grad_norm": 0.15965911746025085, + "learning_rate": 8.60950865327233e-06, + "loss": 0.0248, + "step": 9090 + }, + { + "epoch": 0.5840774704952624, + "grad_norm": 3.24701189994812, + "learning_rate": 8.59624693322724e-06, + "loss": 0.034, + "step": 9100 + }, + { + "epoch": 0.584719313869433, + "grad_norm": 0.09657014906406403, + "learning_rate": 8.58298521318215e-06, + "loss": 0.032, + "step": 9110 + }, + { + "epoch": 0.5853611572436036, + "grad_norm": 2.3377416133880615, + "learning_rate": 8.56972349313706e-06, + "loss": 0.0441, + "step": 9120 + }, + { + "epoch": 0.5860030006177742, + "grad_norm": 1.443271279335022, + "learning_rate": 8.55646177309197e-06, + "loss": 0.0247, + "step": 9130 + }, + { + "epoch": 0.5866448439919448, + "grad_norm": 3.8856918811798096, + "learning_rate": 8.54320005304688e-06, + "loss": 0.0189, + "step": 9140 + }, + { + "epoch": 0.5872866873661154, + "grad_norm": 0.44424521923065186, + "learning_rate": 8.529938333001791e-06, + "loss": 0.0216, + "step": 9150 + }, + { + "epoch": 0.5879285307402861, + "grad_norm": 0.9118013978004456, + "learning_rate": 8.516676612956702e-06, + "loss": 0.0252, + "step": 9160 + }, + { + "epoch": 0.5885703741144567, + "grad_norm": 1.9455515146255493, + "learning_rate": 8.503414892911612e-06, + "loss": 0.0174, + "step": 9170 + }, + { + "epoch": 0.5892122174886273, + "grad_norm": 0.43138858675956726, + "learning_rate": 8.490153172866521e-06, + "loss": 0.0227, + "step": 9180 + }, + { + "epoch": 0.5898540608627979, + "grad_norm": 1.8877631425857544, + "learning_rate": 8.476891452821432e-06, + "loss": 0.0376, + "step": 9190 + }, + { + "epoch": 0.5904959042369686, + "grad_norm": 0.6855882406234741, + "learning_rate": 8.463629732776342e-06, + "loss": 0.0261, + "step": 9200 + }, + { + "epoch": 0.5911377476111392, + "grad_norm": 1.141603946685791, + "learning_rate": 8.450368012731251e-06, + "loss": 0.0382, + "step": 9210 + }, + { + "epoch": 0.5917795909853099, + "grad_norm": 0.5965757369995117, + "learning_rate": 8.437106292686162e-06, + "loss": 0.0264, + "step": 9220 + }, + { + "epoch": 0.5924214343594805, + "grad_norm": 6.3784637451171875, + "learning_rate": 8.423844572641072e-06, + "loss": 0.0331, + "step": 9230 + }, + { + "epoch": 0.5930632777336511, + "grad_norm": 1.8399165868759155, + "learning_rate": 8.410582852595981e-06, + "loss": 0.0178, + "step": 9240 + }, + { + "epoch": 0.5937051211078217, + "grad_norm": 0.6760837435722351, + "learning_rate": 8.397321132550892e-06, + "loss": 0.0219, + "step": 9250 + }, + { + "epoch": 0.5943469644819923, + "grad_norm": 0.23297323286533356, + "learning_rate": 8.384059412505802e-06, + "loss": 0.0161, + "step": 9260 + }, + { + "epoch": 0.5949888078561629, + "grad_norm": 0.9712881445884705, + "learning_rate": 8.370797692460713e-06, + "loss": 0.0139, + "step": 9270 + }, + { + "epoch": 0.5956306512303335, + "grad_norm": 0.10685652494430542, + "learning_rate": 8.357535972415624e-06, + "loss": 0.0258, + "step": 9280 + }, + { + "epoch": 0.5962724946045042, + "grad_norm": 1.6632583141326904, + "learning_rate": 8.344274252370534e-06, + "loss": 0.0251, + "step": 9290 + }, + { + "epoch": 0.5969143379786748, + "grad_norm": 0.5271076560020447, + "learning_rate": 8.331012532325443e-06, + "loss": 0.0291, + "step": 9300 + }, + { + "epoch": 0.5975561813528454, + "grad_norm": 1.0587668418884277, + "learning_rate": 8.317750812280353e-06, + "loss": 0.0257, + "step": 9310 + }, + { + "epoch": 0.598198024727016, + "grad_norm": 1.011917233467102, + "learning_rate": 8.304489092235264e-06, + "loss": 0.0309, + "step": 9320 + }, + { + "epoch": 0.5988398681011866, + "grad_norm": 0.80730801820755, + "learning_rate": 8.291227372190173e-06, + "loss": 0.0256, + "step": 9330 + }, + { + "epoch": 0.5994817114753572, + "grad_norm": 0.4712038040161133, + "learning_rate": 8.277965652145083e-06, + "loss": 0.021, + "step": 9340 + }, + { + "epoch": 0.6001235548495278, + "grad_norm": 0.05072961002588272, + "learning_rate": 8.264703932099994e-06, + "loss": 0.0118, + "step": 9350 + }, + { + "epoch": 0.6007653982236985, + "grad_norm": 5.919791221618652, + "learning_rate": 8.251442212054904e-06, + "loss": 0.0259, + "step": 9360 + }, + { + "epoch": 0.6014072415978691, + "grad_norm": 0.14993645250797272, + "learning_rate": 8.238180492009815e-06, + "loss": 0.0088, + "step": 9370 + }, + { + "epoch": 0.6020490849720397, + "grad_norm": 4.77583646774292, + "learning_rate": 8.224918771964724e-06, + "loss": 0.0176, + "step": 9380 + }, + { + "epoch": 0.6026909283462103, + "grad_norm": 0.11115249246358871, + "learning_rate": 8.211657051919635e-06, + "loss": 0.0282, + "step": 9390 + }, + { + "epoch": 0.6033327717203809, + "grad_norm": 1.8065743446350098, + "learning_rate": 8.198395331874545e-06, + "loss": 0.0161, + "step": 9400 + }, + { + "epoch": 0.6039746150945515, + "grad_norm": 0.1328521966934204, + "learning_rate": 8.185133611829454e-06, + "loss": 0.0223, + "step": 9410 + }, + { + "epoch": 0.6046164584687221, + "grad_norm": 1.3424381017684937, + "learning_rate": 8.171871891784365e-06, + "loss": 0.0189, + "step": 9420 + }, + { + "epoch": 0.6052583018428928, + "grad_norm": 1.714998722076416, + "learning_rate": 8.158610171739275e-06, + "loss": 0.0251, + "step": 9430 + }, + { + "epoch": 0.6059001452170634, + "grad_norm": 2.506918430328369, + "learning_rate": 8.145348451694184e-06, + "loss": 0.0187, + "step": 9440 + }, + { + "epoch": 0.606541988591234, + "grad_norm": 3.1794984340667725, + "learning_rate": 8.132086731649096e-06, + "loss": 0.0297, + "step": 9450 + }, + { + "epoch": 0.6071838319654046, + "grad_norm": 0.3733360469341278, + "learning_rate": 8.118825011604005e-06, + "loss": 0.0181, + "step": 9460 + }, + { + "epoch": 0.6078256753395752, + "grad_norm": 2.3479220867156982, + "learning_rate": 8.105563291558916e-06, + "loss": 0.023, + "step": 9470 + }, + { + "epoch": 0.6084675187137458, + "grad_norm": 1.5137450695037842, + "learning_rate": 8.092301571513826e-06, + "loss": 0.0137, + "step": 9480 + }, + { + "epoch": 0.6091093620879166, + "grad_norm": 0.19078929722309113, + "learning_rate": 8.079039851468737e-06, + "loss": 0.021, + "step": 9490 + }, + { + "epoch": 0.6097512054620872, + "grad_norm": 2.02891206741333, + "learning_rate": 8.065778131423646e-06, + "loss": 0.0186, + "step": 9500 + }, + { + "epoch": 0.6103930488362578, + "grad_norm": 2.559931993484497, + "learning_rate": 8.052516411378558e-06, + "loss": 0.0236, + "step": 9510 + }, + { + "epoch": 0.6110348922104284, + "grad_norm": 0.35723981261253357, + "learning_rate": 8.039254691333467e-06, + "loss": 0.013, + "step": 9520 + }, + { + "epoch": 0.611676735584599, + "grad_norm": 1.501561164855957, + "learning_rate": 8.025992971288377e-06, + "loss": 0.0262, + "step": 9530 + }, + { + "epoch": 0.6123185789587696, + "grad_norm": 1.1635639667510986, + "learning_rate": 8.012731251243286e-06, + "loss": 0.0238, + "step": 9540 + }, + { + "epoch": 0.6129604223329402, + "grad_norm": 0.27196788787841797, + "learning_rate": 7.999469531198197e-06, + "loss": 0.0198, + "step": 9550 + }, + { + "epoch": 0.6136022657071108, + "grad_norm": 2.936884880065918, + "learning_rate": 7.986207811153107e-06, + "loss": 0.0324, + "step": 9560 + }, + { + "epoch": 0.6142441090812815, + "grad_norm": 0.5064958930015564, + "learning_rate": 7.972946091108018e-06, + "loss": 0.0149, + "step": 9570 + }, + { + "epoch": 0.6148859524554521, + "grad_norm": 0.12043922394514084, + "learning_rate": 7.959684371062927e-06, + "loss": 0.0205, + "step": 9580 + }, + { + "epoch": 0.6155277958296227, + "grad_norm": 0.04327112436294556, + "learning_rate": 7.946422651017838e-06, + "loss": 0.0392, + "step": 9590 + }, + { + "epoch": 0.6161696392037933, + "grad_norm": 0.17598643898963928, + "learning_rate": 7.933160930972748e-06, + "loss": 0.018, + "step": 9600 + }, + { + "epoch": 0.6168114825779639, + "grad_norm": 3.665308713912964, + "learning_rate": 7.919899210927659e-06, + "loss": 0.0239, + "step": 9610 + }, + { + "epoch": 0.6174533259521345, + "grad_norm": 4.262577056884766, + "learning_rate": 7.906637490882569e-06, + "loss": 0.027, + "step": 9620 + }, + { + "epoch": 0.6180951693263051, + "grad_norm": 2.2056829929351807, + "learning_rate": 7.893375770837478e-06, + "loss": 0.0276, + "step": 9630 + }, + { + "epoch": 0.6187370127004758, + "grad_norm": 4.004680633544922, + "learning_rate": 7.880114050792388e-06, + "loss": 0.0194, + "step": 9640 + }, + { + "epoch": 0.6193788560746464, + "grad_norm": 0.2112032175064087, + "learning_rate": 7.866852330747299e-06, + "loss": 0.0177, + "step": 9650 + }, + { + "epoch": 0.620020699448817, + "grad_norm": 1.0510079860687256, + "learning_rate": 7.853590610702208e-06, + "loss": 0.0263, + "step": 9660 + }, + { + "epoch": 0.6206625428229876, + "grad_norm": 0.08418594300746918, + "learning_rate": 7.840328890657118e-06, + "loss": 0.0236, + "step": 9670 + }, + { + "epoch": 0.6213043861971582, + "grad_norm": 2.5858445167541504, + "learning_rate": 7.827067170612029e-06, + "loss": 0.0228, + "step": 9680 + }, + { + "epoch": 0.6219462295713288, + "grad_norm": 0.09412383288145065, + "learning_rate": 7.813805450566938e-06, + "loss": 0.0235, + "step": 9690 + }, + { + "epoch": 0.6225880729454994, + "grad_norm": 1.0433266162872314, + "learning_rate": 7.80054373052185e-06, + "loss": 0.0243, + "step": 9700 + }, + { + "epoch": 0.6232299163196701, + "grad_norm": 1.1485860347747803, + "learning_rate": 7.78728201047676e-06, + "loss": 0.0281, + "step": 9710 + }, + { + "epoch": 0.6238717596938407, + "grad_norm": 2.220686674118042, + "learning_rate": 7.77402029043167e-06, + "loss": 0.0242, + "step": 9720 + }, + { + "epoch": 0.6245136030680113, + "grad_norm": 1.1384696960449219, + "learning_rate": 7.76075857038658e-06, + "loss": 0.0226, + "step": 9730 + }, + { + "epoch": 0.6251554464421819, + "grad_norm": 3.107987880706787, + "learning_rate": 7.74749685034149e-06, + "loss": 0.034, + "step": 9740 + }, + { + "epoch": 0.6257972898163525, + "grad_norm": 0.5255154967308044, + "learning_rate": 7.7342351302964e-06, + "loss": 0.0157, + "step": 9750 + }, + { + "epoch": 0.6264391331905231, + "grad_norm": 0.11366035789251328, + "learning_rate": 7.72097341025131e-06, + "loss": 0.0122, + "step": 9760 + }, + { + "epoch": 0.6270809765646937, + "grad_norm": 2.00895094871521, + "learning_rate": 7.70771169020622e-06, + "loss": 0.0222, + "step": 9770 + }, + { + "epoch": 0.6277228199388645, + "grad_norm": 3.02500319480896, + "learning_rate": 7.69444997016113e-06, + "loss": 0.016, + "step": 9780 + }, + { + "epoch": 0.6283646633130351, + "grad_norm": 4.960433483123779, + "learning_rate": 7.68118825011604e-06, + "loss": 0.0247, + "step": 9790 + }, + { + "epoch": 0.6290065066872057, + "grad_norm": 0.007853507064282894, + "learning_rate": 7.667926530070951e-06, + "loss": 0.0364, + "step": 9800 + }, + { + "epoch": 0.6296483500613763, + "grad_norm": 2.501887559890747, + "learning_rate": 7.65466481002586e-06, + "loss": 0.0265, + "step": 9810 + }, + { + "epoch": 0.6302901934355469, + "grad_norm": 0.5652506351470947, + "learning_rate": 7.641403089980772e-06, + "loss": 0.0146, + "step": 9820 + }, + { + "epoch": 0.6309320368097175, + "grad_norm": 4.806582927703857, + "learning_rate": 7.628141369935681e-06, + "loss": 0.0161, + "step": 9830 + }, + { + "epoch": 0.6315738801838882, + "grad_norm": 1.12467622756958, + "learning_rate": 7.6148796498905915e-06, + "loss": 0.0278, + "step": 9840 + }, + { + "epoch": 0.6322157235580588, + "grad_norm": 1.5796605348587036, + "learning_rate": 7.601617929845502e-06, + "loss": 0.0246, + "step": 9850 + }, + { + "epoch": 0.6328575669322294, + "grad_norm": 1.3487434387207031, + "learning_rate": 7.588356209800411e-06, + "loss": 0.0126, + "step": 9860 + }, + { + "epoch": 0.6334994103064, + "grad_norm": 0.05349859222769737, + "learning_rate": 7.575094489755322e-06, + "loss": 0.0215, + "step": 9870 + }, + { + "epoch": 0.6341412536805706, + "grad_norm": 1.3048077821731567, + "learning_rate": 7.561832769710232e-06, + "loss": 0.0223, + "step": 9880 + }, + { + "epoch": 0.6347830970547412, + "grad_norm": 0.46381476521492004, + "learning_rate": 7.548571049665142e-06, + "loss": 0.0167, + "step": 9890 + }, + { + "epoch": 0.6354249404289118, + "grad_norm": 0.19564686715602875, + "learning_rate": 7.535309329620052e-06, + "loss": 0.0129, + "step": 9900 + }, + { + "epoch": 0.6360667838030825, + "grad_norm": 1.4157600402832031, + "learning_rate": 7.522047609574963e-06, + "loss": 0.0121, + "step": 9910 + }, + { + "epoch": 0.6367086271772531, + "grad_norm": 0.7896568179130554, + "learning_rate": 7.5087858895298724e-06, + "loss": 0.021, + "step": 9920 + }, + { + "epoch": 0.6373504705514237, + "grad_norm": 0.48979702591896057, + "learning_rate": 7.495524169484783e-06, + "loss": 0.0423, + "step": 9930 + }, + { + "epoch": 0.6379923139255943, + "grad_norm": 0.4577849805355072, + "learning_rate": 7.482262449439693e-06, + "loss": 0.0214, + "step": 9940 + }, + { + "epoch": 0.6386341572997649, + "grad_norm": 0.2398267239332199, + "learning_rate": 7.469000729394603e-06, + "loss": 0.0307, + "step": 9950 + }, + { + "epoch": 0.6392760006739355, + "grad_norm": 0.11448551714420319, + "learning_rate": 7.455739009349513e-06, + "loss": 0.0222, + "step": 9960 + }, + { + "epoch": 0.6399178440481061, + "grad_norm": 3.8593854904174805, + "learning_rate": 7.442477289304424e-06, + "loss": 0.0319, + "step": 9970 + }, + { + "epoch": 0.6405596874222768, + "grad_norm": 0.3872242271900177, + "learning_rate": 7.4292155692593335e-06, + "loss": 0.0297, + "step": 9980 + }, + { + "epoch": 0.6412015307964474, + "grad_norm": 0.513504683971405, + "learning_rate": 7.415953849214244e-06, + "loss": 0.0307, + "step": 9990 + }, + { + "epoch": 0.641843374170618, + "grad_norm": 5.903943061828613, + "learning_rate": 7.402692129169153e-06, + "loss": 0.0392, + "step": 10000 + }, + { + "epoch": 0.6424852175447886, + "grad_norm": 0.5244473218917847, + "learning_rate": 7.3894304091240645e-06, + "loss": 0.034, + "step": 10010 + }, + { + "epoch": 0.6431270609189592, + "grad_norm": 0.04969165101647377, + "learning_rate": 7.376168689078974e-06, + "loss": 0.012, + "step": 10020 + }, + { + "epoch": 0.6437689042931298, + "grad_norm": 2.9083058834075928, + "learning_rate": 7.362906969033884e-06, + "loss": 0.0293, + "step": 10030 + }, + { + "epoch": 0.6444107476673004, + "grad_norm": 0.10136771202087402, + "learning_rate": 7.349645248988795e-06, + "loss": 0.0228, + "step": 10040 + }, + { + "epoch": 0.6450525910414711, + "grad_norm": 0.04527024179697037, + "learning_rate": 7.336383528943705e-06, + "loss": 0.024, + "step": 10050 + }, + { + "epoch": 0.6456944344156417, + "grad_norm": 0.4695492684841156, + "learning_rate": 7.3231218088986144e-06, + "loss": 0.0185, + "step": 10060 + }, + { + "epoch": 0.6463362777898123, + "grad_norm": 2.390000343322754, + "learning_rate": 7.309860088853526e-06, + "loss": 0.0189, + "step": 10070 + }, + { + "epoch": 0.646978121163983, + "grad_norm": 0.30994078516960144, + "learning_rate": 7.296598368808435e-06, + "loss": 0.0212, + "step": 10080 + }, + { + "epoch": 0.6476199645381536, + "grad_norm": 1.3190053701400757, + "learning_rate": 7.283336648763345e-06, + "loss": 0.0312, + "step": 10090 + }, + { + "epoch": 0.6482618079123242, + "grad_norm": 5.3458662033081055, + "learning_rate": 7.270074928718255e-06, + "loss": 0.0191, + "step": 10100 + }, + { + "epoch": 0.6489036512864949, + "grad_norm": 1.1833816766738892, + "learning_rate": 7.256813208673166e-06, + "loss": 0.0311, + "step": 10110 + }, + { + "epoch": 0.6495454946606655, + "grad_norm": 0.6074949502944946, + "learning_rate": 7.2435514886280755e-06, + "loss": 0.0258, + "step": 10120 + }, + { + "epoch": 0.6501873380348361, + "grad_norm": 1.357013463973999, + "learning_rate": 7.230289768582985e-06, + "loss": 0.0253, + "step": 10130 + }, + { + "epoch": 0.6508291814090067, + "grad_norm": 0.33180567622184753, + "learning_rate": 7.217028048537896e-06, + "loss": 0.0288, + "step": 10140 + }, + { + "epoch": 0.6514710247831773, + "grad_norm": 0.6609655022621155, + "learning_rate": 7.2037663284928065e-06, + "loss": 0.0206, + "step": 10150 + }, + { + "epoch": 0.6521128681573479, + "grad_norm": 0.3761395812034607, + "learning_rate": 7.190504608447716e-06, + "loss": 0.0189, + "step": 10160 + }, + { + "epoch": 0.6527547115315185, + "grad_norm": 0.29698699712753296, + "learning_rate": 7.177242888402627e-06, + "loss": 0.0219, + "step": 10170 + }, + { + "epoch": 0.6533965549056892, + "grad_norm": 0.464578777551651, + "learning_rate": 7.163981168357537e-06, + "loss": 0.0272, + "step": 10180 + }, + { + "epoch": 0.6540383982798598, + "grad_norm": 2.6864356994628906, + "learning_rate": 7.150719448312446e-06, + "loss": 0.0167, + "step": 10190 + }, + { + "epoch": 0.6546802416540304, + "grad_norm": 2.0054922103881836, + "learning_rate": 7.137457728267357e-06, + "loss": 0.0113, + "step": 10200 + }, + { + "epoch": 0.655322085028201, + "grad_norm": 1.1829546689987183, + "learning_rate": 7.124196008222267e-06, + "loss": 0.0173, + "step": 10210 + }, + { + "epoch": 0.6559639284023716, + "grad_norm": 0.9671972990036011, + "learning_rate": 7.110934288177177e-06, + "loss": 0.0224, + "step": 10220 + }, + { + "epoch": 0.6566057717765422, + "grad_norm": 1.9605666399002075, + "learning_rate": 7.097672568132087e-06, + "loss": 0.0228, + "step": 10230 + }, + { + "epoch": 0.6572476151507128, + "grad_norm": 1.4973517656326294, + "learning_rate": 7.084410848086998e-06, + "loss": 0.0363, + "step": 10240 + }, + { + "epoch": 0.6578894585248835, + "grad_norm": 0.1850045919418335, + "learning_rate": 7.071149128041907e-06, + "loss": 0.0302, + "step": 10250 + }, + { + "epoch": 0.6585313018990541, + "grad_norm": 0.5483193397521973, + "learning_rate": 7.0578874079968176e-06, + "loss": 0.0183, + "step": 10260 + }, + { + "epoch": 0.6591731452732247, + "grad_norm": 1.6930410861968994, + "learning_rate": 7.044625687951728e-06, + "loss": 0.0204, + "step": 10270 + }, + { + "epoch": 0.6598149886473953, + "grad_norm": 0.10521617531776428, + "learning_rate": 7.031363967906638e-06, + "loss": 0.025, + "step": 10280 + }, + { + "epoch": 0.6604568320215659, + "grad_norm": 0.20267446339130402, + "learning_rate": 7.018102247861548e-06, + "loss": 0.0288, + "step": 10290 + }, + { + "epoch": 0.6610986753957365, + "grad_norm": 0.15048721432685852, + "learning_rate": 7.004840527816459e-06, + "loss": 0.0309, + "step": 10300 + }, + { + "epoch": 0.6617405187699071, + "grad_norm": 0.7346864938735962, + "learning_rate": 6.991578807771368e-06, + "loss": 0.0363, + "step": 10310 + }, + { + "epoch": 0.6623823621440778, + "grad_norm": 0.04179545119404793, + "learning_rate": 6.978317087726279e-06, + "loss": 0.0145, + "step": 10320 + }, + { + "epoch": 0.6630242055182484, + "grad_norm": 0.3221127390861511, + "learning_rate": 6.965055367681188e-06, + "loss": 0.0381, + "step": 10330 + }, + { + "epoch": 0.663666048892419, + "grad_norm": 0.3161191940307617, + "learning_rate": 6.951793647636099e-06, + "loss": 0.0201, + "step": 10340 + }, + { + "epoch": 0.6643078922665896, + "grad_norm": 0.6822152137756348, + "learning_rate": 6.938531927591009e-06, + "loss": 0.0115, + "step": 10350 + }, + { + "epoch": 0.6649497356407602, + "grad_norm": 0.29630982875823975, + "learning_rate": 6.925270207545919e-06, + "loss": 0.02, + "step": 10360 + }, + { + "epoch": 0.6655915790149309, + "grad_norm": 0.5653153657913208, + "learning_rate": 6.9120084875008294e-06, + "loss": 0.0197, + "step": 10370 + }, + { + "epoch": 0.6662334223891015, + "grad_norm": 0.08264129608869553, + "learning_rate": 6.89874676745574e-06, + "loss": 0.0173, + "step": 10380 + }, + { + "epoch": 0.6668752657632722, + "grad_norm": 1.7407037019729614, + "learning_rate": 6.885485047410649e-06, + "loss": 0.0172, + "step": 10390 + }, + { + "epoch": 0.6675171091374428, + "grad_norm": 0.17370440065860748, + "learning_rate": 6.87222332736556e-06, + "loss": 0.0233, + "step": 10400 + }, + { + "epoch": 0.6681589525116134, + "grad_norm": 0.6718391180038452, + "learning_rate": 6.85896160732047e-06, + "loss": 0.029, + "step": 10410 + }, + { + "epoch": 0.668800795885784, + "grad_norm": 0.21962031722068787, + "learning_rate": 6.84569988727538e-06, + "loss": 0.0157, + "step": 10420 + }, + { + "epoch": 0.6694426392599546, + "grad_norm": 1.031297206878662, + "learning_rate": 6.8324381672302905e-06, + "loss": 0.0296, + "step": 10430 + }, + { + "epoch": 0.6700844826341252, + "grad_norm": 3.4837470054626465, + "learning_rate": 6.819176447185201e-06, + "loss": 0.0358, + "step": 10440 + }, + { + "epoch": 0.6707263260082958, + "grad_norm": 0.05151769146323204, + "learning_rate": 6.80591472714011e-06, + "loss": 0.0316, + "step": 10450 + }, + { + "epoch": 0.6713681693824665, + "grad_norm": 0.693645715713501, + "learning_rate": 6.792653007095021e-06, + "loss": 0.034, + "step": 10460 + }, + { + "epoch": 0.6720100127566371, + "grad_norm": 1.7422330379486084, + "learning_rate": 6.779391287049931e-06, + "loss": 0.0133, + "step": 10470 + }, + { + "epoch": 0.6726518561308077, + "grad_norm": 1.208640456199646, + "learning_rate": 6.766129567004841e-06, + "loss": 0.0339, + "step": 10480 + }, + { + "epoch": 0.6732936995049783, + "grad_norm": 3.527818202972412, + "learning_rate": 6.752867846959751e-06, + "loss": 0.0204, + "step": 10490 + }, + { + "epoch": 0.6739355428791489, + "grad_norm": 0.4659969210624695, + "learning_rate": 6.739606126914662e-06, + "loss": 0.0231, + "step": 10500 + }, + { + "epoch": 0.6745773862533195, + "grad_norm": 0.316992849111557, + "learning_rate": 6.7263444068695715e-06, + "loss": 0.0151, + "step": 10510 + }, + { + "epoch": 0.6752192296274901, + "grad_norm": 0.5036879181861877, + "learning_rate": 6.713082686824482e-06, + "loss": 0.0274, + "step": 10520 + }, + { + "epoch": 0.6758610730016608, + "grad_norm": 2.00020170211792, + "learning_rate": 6.699820966779392e-06, + "loss": 0.0209, + "step": 10530 + }, + { + "epoch": 0.6765029163758314, + "grad_norm": 0.012486232444643974, + "learning_rate": 6.6865592467343024e-06, + "loss": 0.0187, + "step": 10540 + }, + { + "epoch": 0.677144759750002, + "grad_norm": 0.03590549901127815, + "learning_rate": 6.673297526689212e-06, + "loss": 0.0298, + "step": 10550 + }, + { + "epoch": 0.6777866031241726, + "grad_norm": 0.11736287921667099, + "learning_rate": 6.660035806644121e-06, + "loss": 0.0214, + "step": 10560 + }, + { + "epoch": 0.6784284464983432, + "grad_norm": 0.8670855164527893, + "learning_rate": 6.6467740865990326e-06, + "loss": 0.015, + "step": 10570 + }, + { + "epoch": 0.6790702898725138, + "grad_norm": 5.439849376678467, + "learning_rate": 6.633512366553943e-06, + "loss": 0.0259, + "step": 10580 + }, + { + "epoch": 0.6797121332466844, + "grad_norm": 0.3086478114128113, + "learning_rate": 6.620250646508852e-06, + "loss": 0.018, + "step": 10590 + }, + { + "epoch": 0.6803539766208551, + "grad_norm": 0.7371939420700073, + "learning_rate": 6.6069889264637635e-06, + "loss": 0.0192, + "step": 10600 + }, + { + "epoch": 0.6809958199950257, + "grad_norm": 0.051234494894742966, + "learning_rate": 6.593727206418673e-06, + "loss": 0.0196, + "step": 10610 + }, + { + "epoch": 0.6816376633691963, + "grad_norm": 0.4612034559249878, + "learning_rate": 6.5804654863735825e-06, + "loss": 0.0163, + "step": 10620 + }, + { + "epoch": 0.6822795067433669, + "grad_norm": 0.1792960911989212, + "learning_rate": 6.567203766328494e-06, + "loss": 0.0277, + "step": 10630 + }, + { + "epoch": 0.6829213501175375, + "grad_norm": 0.02334243804216385, + "learning_rate": 6.553942046283404e-06, + "loss": 0.0167, + "step": 10640 + }, + { + "epoch": 0.6835631934917081, + "grad_norm": 0.13844627141952515, + "learning_rate": 6.5406803262383135e-06, + "loss": 0.02, + "step": 10650 + }, + { + "epoch": 0.6842050368658789, + "grad_norm": 1.1921093463897705, + "learning_rate": 6.527418606193225e-06, + "loss": 0.0408, + "step": 10660 + }, + { + "epoch": 0.6848468802400495, + "grad_norm": 6.113221645355225, + "learning_rate": 6.514156886148134e-06, + "loss": 0.0202, + "step": 10670 + }, + { + "epoch": 0.6854887236142201, + "grad_norm": 2.2112908363342285, + "learning_rate": 6.500895166103044e-06, + "loss": 0.0449, + "step": 10680 + }, + { + "epoch": 0.6861305669883907, + "grad_norm": 0.7895561456680298, + "learning_rate": 6.487633446057954e-06, + "loss": 0.0246, + "step": 10690 + }, + { + "epoch": 0.6867724103625613, + "grad_norm": 0.5855046510696411, + "learning_rate": 6.474371726012865e-06, + "loss": 0.0248, + "step": 10700 + }, + { + "epoch": 0.6874142537367319, + "grad_norm": 6.148792266845703, + "learning_rate": 6.4611100059677746e-06, + "loss": 0.024, + "step": 10710 + }, + { + "epoch": 0.6880560971109025, + "grad_norm": 0.5399889945983887, + "learning_rate": 6.447848285922684e-06, + "loss": 0.0314, + "step": 10720 + }, + { + "epoch": 0.6886979404850732, + "grad_norm": 1.1208041906356812, + "learning_rate": 6.434586565877595e-06, + "loss": 0.0102, + "step": 10730 + }, + { + "epoch": 0.6893397838592438, + "grad_norm": 0.8087825775146484, + "learning_rate": 6.421324845832505e-06, + "loss": 0.0272, + "step": 10740 + }, + { + "epoch": 0.6899816272334144, + "grad_norm": 0.9773532152175903, + "learning_rate": 6.408063125787415e-06, + "loss": 0.0196, + "step": 10750 + }, + { + "epoch": 0.690623470607585, + "grad_norm": 0.06517565995454788, + "learning_rate": 6.394801405742325e-06, + "loss": 0.0133, + "step": 10760 + }, + { + "epoch": 0.6912653139817556, + "grad_norm": 0.08576376736164093, + "learning_rate": 6.381539685697236e-06, + "loss": 0.0205, + "step": 10770 + }, + { + "epoch": 0.6919071573559262, + "grad_norm": 0.09191477298736572, + "learning_rate": 6.368277965652145e-06, + "loss": 0.0237, + "step": 10780 + }, + { + "epoch": 0.6925490007300968, + "grad_norm": 1.1734598875045776, + "learning_rate": 6.3550162456070555e-06, + "loss": 0.0203, + "step": 10790 + }, + { + "epoch": 0.6931908441042675, + "grad_norm": 1.6206161975860596, + "learning_rate": 6.341754525561966e-06, + "loss": 0.0267, + "step": 10800 + }, + { + "epoch": 0.6938326874784381, + "grad_norm": 0.42549261450767517, + "learning_rate": 6.328492805516876e-06, + "loss": 0.0261, + "step": 10810 + }, + { + "epoch": 0.6944745308526087, + "grad_norm": 0.44076070189476013, + "learning_rate": 6.315231085471786e-06, + "loss": 0.0248, + "step": 10820 + }, + { + "epoch": 0.6951163742267793, + "grad_norm": 0.2725442349910736, + "learning_rate": 6.301969365426697e-06, + "loss": 0.0228, + "step": 10830 + }, + { + "epoch": 0.6957582176009499, + "grad_norm": 3.0624146461486816, + "learning_rate": 6.288707645381606e-06, + "loss": 0.0415, + "step": 10840 + }, + { + "epoch": 0.6964000609751205, + "grad_norm": 2.1957530975341797, + "learning_rate": 6.275445925336517e-06, + "loss": 0.0277, + "step": 10850 + }, + { + "epoch": 0.6970419043492911, + "grad_norm": 0.498536080121994, + "learning_rate": 6.262184205291427e-06, + "loss": 0.0342, + "step": 10860 + }, + { + "epoch": 0.6976837477234618, + "grad_norm": 0.23503071069717407, + "learning_rate": 6.248922485246337e-06, + "loss": 0.0289, + "step": 10870 + }, + { + "epoch": 0.6983255910976324, + "grad_norm": 0.30300644040107727, + "learning_rate": 6.235660765201247e-06, + "loss": 0.016, + "step": 10880 + }, + { + "epoch": 0.698967434471803, + "grad_norm": 0.22650690376758575, + "learning_rate": 6.222399045156158e-06, + "loss": 0.0103, + "step": 10890 + }, + { + "epoch": 0.6996092778459736, + "grad_norm": 0.1359551101922989, + "learning_rate": 6.209137325111067e-06, + "loss": 0.0233, + "step": 10900 + }, + { + "epoch": 0.7002511212201442, + "grad_norm": 0.02224060520529747, + "learning_rate": 6.195875605065978e-06, + "loss": 0.0158, + "step": 10910 + }, + { + "epoch": 0.7008929645943148, + "grad_norm": 1.749082088470459, + "learning_rate": 6.182613885020887e-06, + "loss": 0.0183, + "step": 10920 + }, + { + "epoch": 0.7015348079684854, + "grad_norm": 0.18207766115665436, + "learning_rate": 6.169352164975798e-06, + "loss": 0.0156, + "step": 10930 + }, + { + "epoch": 0.702176651342656, + "grad_norm": 6.35283899307251, + "learning_rate": 6.156090444930708e-06, + "loss": 0.0488, + "step": 10940 + }, + { + "epoch": 0.7028184947168268, + "grad_norm": 0.07517176121473312, + "learning_rate": 6.142828724885618e-06, + "loss": 0.0235, + "step": 10950 + }, + { + "epoch": 0.7034603380909974, + "grad_norm": 3.0476794242858887, + "learning_rate": 6.1295670048405285e-06, + "loss": 0.0201, + "step": 10960 + }, + { + "epoch": 0.704102181465168, + "grad_norm": 0.7774531841278076, + "learning_rate": 6.116305284795439e-06, + "loss": 0.0192, + "step": 10970 + }, + { + "epoch": 0.7047440248393386, + "grad_norm": 0.27221471071243286, + "learning_rate": 6.103043564750348e-06, + "loss": 0.0283, + "step": 10980 + }, + { + "epoch": 0.7053858682135092, + "grad_norm": 1.1724214553833008, + "learning_rate": 6.0897818447052594e-06, + "loss": 0.0131, + "step": 10990 + }, + { + "epoch": 0.7060277115876799, + "grad_norm": 0.020140307024121284, + "learning_rate": 6.076520124660169e-06, + "loss": 0.0216, + "step": 11000 + }, + { + "epoch": 0.7066695549618505, + "grad_norm": 1.0828317403793335, + "learning_rate": 6.063258404615079e-06, + "loss": 0.0184, + "step": 11010 + }, + { + "epoch": 0.7073113983360211, + "grad_norm": 2.071780204772949, + "learning_rate": 6.049996684569989e-06, + "loss": 0.0354, + "step": 11020 + }, + { + "epoch": 0.7079532417101917, + "grad_norm": 0.17378775775432587, + "learning_rate": 6.0367349645249e-06, + "loss": 0.028, + "step": 11030 + }, + { + "epoch": 0.7085950850843623, + "grad_norm": 0.9045881628990173, + "learning_rate": 6.023473244479809e-06, + "loss": 0.0242, + "step": 11040 + }, + { + "epoch": 0.7092369284585329, + "grad_norm": 0.824508011341095, + "learning_rate": 6.010211524434719e-06, + "loss": 0.0131, + "step": 11050 + }, + { + "epoch": 0.7098787718327035, + "grad_norm": 0.4981328248977661, + "learning_rate": 5.99694980438963e-06, + "loss": 0.0233, + "step": 11060 + }, + { + "epoch": 0.7105206152068742, + "grad_norm": 2.5603647232055664, + "learning_rate": 5.98368808434454e-06, + "loss": 0.0295, + "step": 11070 + }, + { + "epoch": 0.7111624585810448, + "grad_norm": 2.7192113399505615, + "learning_rate": 5.97042636429945e-06, + "loss": 0.0177, + "step": 11080 + }, + { + "epoch": 0.7118043019552154, + "grad_norm": 1.1225374937057495, + "learning_rate": 5.957164644254361e-06, + "loss": 0.0203, + "step": 11090 + }, + { + "epoch": 0.712446145329386, + "grad_norm": 1.057196021080017, + "learning_rate": 5.9439029242092705e-06, + "loss": 0.0283, + "step": 11100 + }, + { + "epoch": 0.7130879887035566, + "grad_norm": 0.12977996468544006, + "learning_rate": 5.93064120416418e-06, + "loss": 0.0115, + "step": 11110 + }, + { + "epoch": 0.7137298320777272, + "grad_norm": 1.014158010482788, + "learning_rate": 5.917379484119091e-06, + "loss": 0.0252, + "step": 11120 + }, + { + "epoch": 0.7143716754518978, + "grad_norm": 2.2357592582702637, + "learning_rate": 5.9041177640740015e-06, + "loss": 0.0299, + "step": 11130 + }, + { + "epoch": 0.7150135188260685, + "grad_norm": 0.671495258808136, + "learning_rate": 5.890856044028911e-06, + "loss": 0.0199, + "step": 11140 + }, + { + "epoch": 0.7156553622002391, + "grad_norm": 5.646879196166992, + "learning_rate": 5.87759432398382e-06, + "loss": 0.0361, + "step": 11150 + }, + { + "epoch": 0.7162972055744097, + "grad_norm": 0.16122344136238098, + "learning_rate": 5.864332603938732e-06, + "loss": 0.0269, + "step": 11160 + }, + { + "epoch": 0.7169390489485803, + "grad_norm": 1.2174190282821655, + "learning_rate": 5.851070883893641e-06, + "loss": 0.02, + "step": 11170 + }, + { + "epoch": 0.7175808923227509, + "grad_norm": 0.2357230931520462, + "learning_rate": 5.837809163848551e-06, + "loss": 0.0167, + "step": 11180 + }, + { + "epoch": 0.7182227356969215, + "grad_norm": 2.912205934524536, + "learning_rate": 5.8245474438034626e-06, + "loss": 0.0169, + "step": 11190 + }, + { + "epoch": 0.7188645790710921, + "grad_norm": 4.412971496582031, + "learning_rate": 5.811285723758372e-06, + "loss": 0.0242, + "step": 11200 + }, + { + "epoch": 0.7195064224452627, + "grad_norm": 5.053261756896973, + "learning_rate": 5.7980240037132815e-06, + "loss": 0.0276, + "step": 11210 + }, + { + "epoch": 0.7201482658194334, + "grad_norm": 0.7474462985992432, + "learning_rate": 5.784762283668193e-06, + "loss": 0.0176, + "step": 11220 + }, + { + "epoch": 0.720790109193604, + "grad_norm": 0.7813795208930969, + "learning_rate": 5.771500563623102e-06, + "loss": 0.019, + "step": 11230 + }, + { + "epoch": 0.7214319525677747, + "grad_norm": 5.405880928039551, + "learning_rate": 5.7582388435780125e-06, + "loss": 0.0382, + "step": 11240 + }, + { + "epoch": 0.7220737959419453, + "grad_norm": 0.7542135119438171, + "learning_rate": 5.744977123532922e-06, + "loss": 0.0354, + "step": 11250 + }, + { + "epoch": 0.7227156393161159, + "grad_norm": 0.039900798350572586, + "learning_rate": 5.731715403487833e-06, + "loss": 0.0155, + "step": 11260 + }, + { + "epoch": 0.7233574826902865, + "grad_norm": 0.44073763489723206, + "learning_rate": 5.718453683442743e-06, + "loss": 0.0196, + "step": 11270 + }, + { + "epoch": 0.7239993260644572, + "grad_norm": 0.04682408273220062, + "learning_rate": 5.705191963397653e-06, + "loss": 0.0328, + "step": 11280 + }, + { + "epoch": 0.7246411694386278, + "grad_norm": 0.0709882602095604, + "learning_rate": 5.691930243352563e-06, + "loss": 0.0161, + "step": 11290 + }, + { + "epoch": 0.7252830128127984, + "grad_norm": 2.7055585384368896, + "learning_rate": 5.678668523307474e-06, + "loss": 0.013, + "step": 11300 + }, + { + "epoch": 0.725924856186969, + "grad_norm": 0.7219206690788269, + "learning_rate": 5.665406803262383e-06, + "loss": 0.0145, + "step": 11310 + }, + { + "epoch": 0.7265666995611396, + "grad_norm": 0.4283300042152405, + "learning_rate": 5.652145083217294e-06, + "loss": 0.0217, + "step": 11320 + }, + { + "epoch": 0.7272085429353102, + "grad_norm": 1.872422218322754, + "learning_rate": 5.638883363172204e-06, + "loss": 0.0171, + "step": 11330 + }, + { + "epoch": 0.7278503863094808, + "grad_norm": 0.4110535681247711, + "learning_rate": 5.625621643127114e-06, + "loss": 0.0284, + "step": 11340 + }, + { + "epoch": 0.7284922296836515, + "grad_norm": 1.1330406665802002, + "learning_rate": 5.612359923082024e-06, + "loss": 0.0264, + "step": 11350 + }, + { + "epoch": 0.7291340730578221, + "grad_norm": 0.03569392114877701, + "learning_rate": 5.599098203036935e-06, + "loss": 0.03, + "step": 11360 + }, + { + "epoch": 0.7297759164319927, + "grad_norm": 0.01630323752760887, + "learning_rate": 5.585836482991844e-06, + "loss": 0.0202, + "step": 11370 + }, + { + "epoch": 0.7304177598061633, + "grad_norm": 0.09886395186185837, + "learning_rate": 5.5725747629467545e-06, + "loss": 0.0302, + "step": 11380 + }, + { + "epoch": 0.7310596031803339, + "grad_norm": 0.378889262676239, + "learning_rate": 5.559313042901665e-06, + "loss": 0.03, + "step": 11390 + }, + { + "epoch": 0.7317014465545045, + "grad_norm": 0.40000680088996887, + "learning_rate": 5.546051322856575e-06, + "loss": 0.0276, + "step": 11400 + }, + { + "epoch": 0.7323432899286751, + "grad_norm": 1.454674482345581, + "learning_rate": 5.532789602811485e-06, + "loss": 0.0221, + "step": 11410 + }, + { + "epoch": 0.7329851333028458, + "grad_norm": 0.9703409671783447, + "learning_rate": 5.519527882766396e-06, + "loss": 0.0375, + "step": 11420 + }, + { + "epoch": 0.7336269766770164, + "grad_norm": 5.4596452713012695, + "learning_rate": 5.506266162721305e-06, + "loss": 0.0341, + "step": 11430 + }, + { + "epoch": 0.734268820051187, + "grad_norm": 0.9400603175163269, + "learning_rate": 5.493004442676216e-06, + "loss": 0.0263, + "step": 11440 + }, + { + "epoch": 0.7349106634253576, + "grad_norm": 0.7164040207862854, + "learning_rate": 5.479742722631126e-06, + "loss": 0.0145, + "step": 11450 + }, + { + "epoch": 0.7355525067995282, + "grad_norm": 0.09559702128171921, + "learning_rate": 5.466481002586036e-06, + "loss": 0.0212, + "step": 11460 + }, + { + "epoch": 0.7361943501736988, + "grad_norm": 0.4513958692550659, + "learning_rate": 5.453219282540946e-06, + "loss": 0.0267, + "step": 11470 + }, + { + "epoch": 0.7368361935478694, + "grad_norm": 0.009279868565499783, + "learning_rate": 5.439957562495855e-06, + "loss": 0.0144, + "step": 11480 + }, + { + "epoch": 0.7374780369220401, + "grad_norm": 1.7123267650604248, + "learning_rate": 5.426695842450766e-06, + "loss": 0.0311, + "step": 11490 + }, + { + "epoch": 0.7381198802962107, + "grad_norm": 1.537968397140503, + "learning_rate": 5.413434122405677e-06, + "loss": 0.0332, + "step": 11500 + }, + { + "epoch": 0.7387617236703813, + "grad_norm": 2.3726558685302734, + "learning_rate": 5.400172402360586e-06, + "loss": 0.0282, + "step": 11510 + }, + { + "epoch": 0.7394035670445519, + "grad_norm": 1.8804244995117188, + "learning_rate": 5.386910682315497e-06, + "loss": 0.0279, + "step": 11520 + }, + { + "epoch": 0.7400454104187225, + "grad_norm": 1.1779565811157227, + "learning_rate": 5.373648962270407e-06, + "loss": 0.0281, + "step": 11530 + }, + { + "epoch": 0.7406872537928932, + "grad_norm": 1.5714831352233887, + "learning_rate": 5.360387242225316e-06, + "loss": 0.0244, + "step": 11540 + }, + { + "epoch": 0.7413290971670639, + "grad_norm": 0.5432150363922119, + "learning_rate": 5.3471255221802275e-06, + "loss": 0.0152, + "step": 11550 + }, + { + "epoch": 0.7419709405412345, + "grad_norm": 0.4140097200870514, + "learning_rate": 5.333863802135138e-06, + "loss": 0.0193, + "step": 11560 + }, + { + "epoch": 0.7426127839154051, + "grad_norm": 0.08977576345205307, + "learning_rate": 5.320602082090047e-06, + "loss": 0.0494, + "step": 11570 + }, + { + "epoch": 0.7432546272895757, + "grad_norm": 0.5331017374992371, + "learning_rate": 5.3073403620449585e-06, + "loss": 0.0246, + "step": 11580 + }, + { + "epoch": 0.7438964706637463, + "grad_norm": 0.1366458535194397, + "learning_rate": 5.294078641999868e-06, + "loss": 0.03, + "step": 11590 + }, + { + "epoch": 0.7445383140379169, + "grad_norm": 7.025169849395752, + "learning_rate": 5.2808169219547774e-06, + "loss": 0.0223, + "step": 11600 + }, + { + "epoch": 0.7451801574120875, + "grad_norm": 0.025911634787917137, + "learning_rate": 5.267555201909688e-06, + "loss": 0.0221, + "step": 11610 + }, + { + "epoch": 0.7458220007862582, + "grad_norm": 1.230663776397705, + "learning_rate": 5.254293481864599e-06, + "loss": 0.0248, + "step": 11620 + }, + { + "epoch": 0.7464638441604288, + "grad_norm": 1.829215168952942, + "learning_rate": 5.241031761819508e-06, + "loss": 0.0292, + "step": 11630 + }, + { + "epoch": 0.7471056875345994, + "grad_norm": 1.1922662258148193, + "learning_rate": 5.227770041774418e-06, + "loss": 0.0125, + "step": 11640 + }, + { + "epoch": 0.74774753090877, + "grad_norm": 0.18102401494979858, + "learning_rate": 5.214508321729329e-06, + "loss": 0.0125, + "step": 11650 + }, + { + "epoch": 0.7483893742829406, + "grad_norm": 0.04952188581228256, + "learning_rate": 5.2012466016842385e-06, + "loss": 0.0144, + "step": 11660 + }, + { + "epoch": 0.7490312176571112, + "grad_norm": 0.40653935074806213, + "learning_rate": 5.187984881639149e-06, + "loss": 0.0149, + "step": 11670 + }, + { + "epoch": 0.7496730610312818, + "grad_norm": 2.0297279357910156, + "learning_rate": 5.17472316159406e-06, + "loss": 0.0232, + "step": 11680 + }, + { + "epoch": 0.7503149044054525, + "grad_norm": 1.4835153818130493, + "learning_rate": 5.1614614415489695e-06, + "loss": 0.0114, + "step": 11690 + }, + { + "epoch": 0.7509567477796231, + "grad_norm": 0.1499241292476654, + "learning_rate": 5.148199721503879e-06, + "loss": 0.0109, + "step": 11700 + }, + { + "epoch": 0.7515985911537937, + "grad_norm": 4.053781509399414, + "learning_rate": 5.13493800145879e-06, + "loss": 0.0152, + "step": 11710 + }, + { + "epoch": 0.7522404345279643, + "grad_norm": 0.0353599414229393, + "learning_rate": 5.1216762814137e-06, + "loss": 0.0158, + "step": 11720 + }, + { + "epoch": 0.7528822779021349, + "grad_norm": 0.36769697070121765, + "learning_rate": 5.10841456136861e-06, + "loss": 0.0146, + "step": 11730 + }, + { + "epoch": 0.7535241212763055, + "grad_norm": 1.03465735912323, + "learning_rate": 5.0951528413235194e-06, + "loss": 0.0285, + "step": 11740 + }, + { + "epoch": 0.7541659646504761, + "grad_norm": 0.2146172970533371, + "learning_rate": 5.081891121278431e-06, + "loss": 0.021, + "step": 11750 + }, + { + "epoch": 0.7548078080246468, + "grad_norm": 0.05770499259233475, + "learning_rate": 5.06862940123334e-06, + "loss": 0.0268, + "step": 11760 + }, + { + "epoch": 0.7554496513988174, + "grad_norm": 1.8668345212936401, + "learning_rate": 5.05536768118825e-06, + "loss": 0.0312, + "step": 11770 + }, + { + "epoch": 0.756091494772988, + "grad_norm": 1.0731773376464844, + "learning_rate": 5.042105961143161e-06, + "loss": 0.0298, + "step": 11780 + }, + { + "epoch": 0.7567333381471586, + "grad_norm": 0.1279563009738922, + "learning_rate": 5.028844241098071e-06, + "loss": 0.0183, + "step": 11790 + }, + { + "epoch": 0.7573751815213292, + "grad_norm": 0.774218738079071, + "learning_rate": 5.0155825210529805e-06, + "loss": 0.0139, + "step": 11800 + }, + { + "epoch": 0.7580170248954998, + "grad_norm": 0.9003334045410156, + "learning_rate": 5.002320801007892e-06, + "loss": 0.0223, + "step": 11810 + }, + { + "epoch": 0.7586588682696704, + "grad_norm": 0.5011104941368103, + "learning_rate": 4.989059080962801e-06, + "loss": 0.0193, + "step": 11820 + }, + { + "epoch": 0.7593007116438412, + "grad_norm": 0.020797917619347572, + "learning_rate": 4.9757973609177115e-06, + "loss": 0.0285, + "step": 11830 + }, + { + "epoch": 0.7599425550180118, + "grad_norm": 0.2721422612667084, + "learning_rate": 4.962535640872622e-06, + "loss": 0.0282, + "step": 11840 + }, + { + "epoch": 0.7605843983921824, + "grad_norm": 4.101003170013428, + "learning_rate": 4.949273920827532e-06, + "loss": 0.0191, + "step": 11850 + }, + { + "epoch": 0.761226241766353, + "grad_norm": 0.16916042566299438, + "learning_rate": 4.936012200782442e-06, + "loss": 0.016, + "step": 11860 + }, + { + "epoch": 0.7618680851405236, + "grad_norm": 0.5883210301399231, + "learning_rate": 4.922750480737352e-06, + "loss": 0.0136, + "step": 11870 + }, + { + "epoch": 0.7625099285146942, + "grad_norm": 0.5709671378135681, + "learning_rate": 4.909488760692262e-06, + "loss": 0.0191, + "step": 11880 + }, + { + "epoch": 0.7631517718888648, + "grad_norm": 1.265303611755371, + "learning_rate": 4.896227040647173e-06, + "loss": 0.0156, + "step": 11890 + }, + { + "epoch": 0.7637936152630355, + "grad_norm": 0.45923763513565063, + "learning_rate": 4.882965320602083e-06, + "loss": 0.0299, + "step": 11900 + }, + { + "epoch": 0.7644354586372061, + "grad_norm": 2.511776924133301, + "learning_rate": 4.8697036005569924e-06, + "loss": 0.0202, + "step": 11910 + }, + { + "epoch": 0.7650773020113767, + "grad_norm": 4.2928595542907715, + "learning_rate": 4.856441880511903e-06, + "loss": 0.0342, + "step": 11920 + }, + { + "epoch": 0.7657191453855473, + "grad_norm": 0.0511680543422699, + "learning_rate": 4.843180160466813e-06, + "loss": 0.014, + "step": 11930 + }, + { + "epoch": 0.7663609887597179, + "grad_norm": 0.743169903755188, + "learning_rate": 4.829918440421723e-06, + "loss": 0.0244, + "step": 11940 + }, + { + "epoch": 0.7670028321338885, + "grad_norm": 0.6161398887634277, + "learning_rate": 4.816656720376634e-06, + "loss": 0.0128, + "step": 11950 + }, + { + "epoch": 0.7676446755080591, + "grad_norm": 0.8354362845420837, + "learning_rate": 4.803395000331543e-06, + "loss": 0.0229, + "step": 11960 + }, + { + "epoch": 0.7682865188822298, + "grad_norm": 0.6134613156318665, + "learning_rate": 4.7901332802864535e-06, + "loss": 0.0188, + "step": 11970 + }, + { + "epoch": 0.7689283622564004, + "grad_norm": 0.854161262512207, + "learning_rate": 4.776871560241364e-06, + "loss": 0.019, + "step": 11980 + }, + { + "epoch": 0.769570205630571, + "grad_norm": 1.0251282453536987, + "learning_rate": 4.763609840196274e-06, + "loss": 0.02, + "step": 11990 + }, + { + "epoch": 0.7702120490047416, + "grad_norm": 0.2006016969680786, + "learning_rate": 4.7503481201511845e-06, + "loss": 0.0191, + "step": 12000 + }, + { + "epoch": 0.7708538923789122, + "grad_norm": 2.0554537773132324, + "learning_rate": 4.737086400106094e-06, + "loss": 0.0169, + "step": 12010 + }, + { + "epoch": 0.7714957357530828, + "grad_norm": 0.5309330821037292, + "learning_rate": 4.723824680061004e-06, + "loss": 0.014, + "step": 12020 + }, + { + "epoch": 0.7721375791272534, + "grad_norm": 0.23457559943199158, + "learning_rate": 4.710562960015914e-06, + "loss": 0.0185, + "step": 12030 + }, + { + "epoch": 0.7727794225014241, + "grad_norm": 1.5569202899932861, + "learning_rate": 4.697301239970825e-06, + "loss": 0.0201, + "step": 12040 + }, + { + "epoch": 0.7734212658755947, + "grad_norm": 0.03183150291442871, + "learning_rate": 4.684039519925735e-06, + "loss": 0.0124, + "step": 12050 + }, + { + "epoch": 0.7740631092497653, + "grad_norm": 1.1277698278427124, + "learning_rate": 4.670777799880645e-06, + "loss": 0.0149, + "step": 12060 + }, + { + "epoch": 0.7747049526239359, + "grad_norm": 0.15539094805717468, + "learning_rate": 4.657516079835555e-06, + "loss": 0.0352, + "step": 12070 + }, + { + "epoch": 0.7753467959981065, + "grad_norm": 0.05271551385521889, + "learning_rate": 4.644254359790465e-06, + "loss": 0.0087, + "step": 12080 + }, + { + "epoch": 0.7759886393722771, + "grad_norm": 0.9591747522354126, + "learning_rate": 4.630992639745375e-06, + "loss": 0.0201, + "step": 12090 + }, + { + "epoch": 0.7766304827464477, + "grad_norm": 0.04693359509110451, + "learning_rate": 4.617730919700285e-06, + "loss": 0.027, + "step": 12100 + }, + { + "epoch": 0.7772723261206184, + "grad_norm": 0.5332335233688354, + "learning_rate": 4.6044691996551955e-06, + "loss": 0.0159, + "step": 12110 + }, + { + "epoch": 0.7779141694947891, + "grad_norm": 0.5745136737823486, + "learning_rate": 4.591207479610106e-06, + "loss": 0.0435, + "step": 12120 + }, + { + "epoch": 0.7785560128689597, + "grad_norm": 0.2693396210670471, + "learning_rate": 4.577945759565016e-06, + "loss": 0.0255, + "step": 12130 + }, + { + "epoch": 0.7791978562431303, + "grad_norm": 0.05429879203438759, + "learning_rate": 4.564684039519926e-06, + "loss": 0.0226, + "step": 12140 + }, + { + "epoch": 0.7798396996173009, + "grad_norm": 5.823631286621094, + "learning_rate": 4.551422319474836e-06, + "loss": 0.0164, + "step": 12150 + }, + { + "epoch": 0.7804815429914715, + "grad_norm": 0.052716001868247986, + "learning_rate": 4.538160599429746e-06, + "loss": 0.0231, + "step": 12160 + }, + { + "epoch": 0.7811233863656422, + "grad_norm": 0.6026047468185425, + "learning_rate": 4.524898879384657e-06, + "loss": 0.0085, + "step": 12170 + }, + { + "epoch": 0.7817652297398128, + "grad_norm": 0.8200371861457825, + "learning_rate": 4.511637159339567e-06, + "loss": 0.0269, + "step": 12180 + }, + { + "epoch": 0.7824070731139834, + "grad_norm": 1.132227897644043, + "learning_rate": 4.4983754392944765e-06, + "loss": 0.0489, + "step": 12190 + }, + { + "epoch": 0.783048916488154, + "grad_norm": 0.8593686819076538, + "learning_rate": 4.485113719249387e-06, + "loss": 0.0203, + "step": 12200 + }, + { + "epoch": 0.7836907598623246, + "grad_norm": 1.9194788932800293, + "learning_rate": 4.471851999204297e-06, + "loss": 0.02, + "step": 12210 + }, + { + "epoch": 0.7843326032364952, + "grad_norm": 5.477539539337158, + "learning_rate": 4.4585902791592074e-06, + "loss": 0.0317, + "step": 12220 + }, + { + "epoch": 0.7849744466106658, + "grad_norm": 0.8278726935386658, + "learning_rate": 4.445328559114118e-06, + "loss": 0.0174, + "step": 12230 + }, + { + "epoch": 0.7856162899848365, + "grad_norm": 2.025526762008667, + "learning_rate": 4.432066839069027e-06, + "loss": 0.018, + "step": 12240 + }, + { + "epoch": 0.7862581333590071, + "grad_norm": 0.141770601272583, + "learning_rate": 4.4188051190239376e-06, + "loss": 0.0251, + "step": 12250 + }, + { + "epoch": 0.7868999767331777, + "grad_norm": 0.11860339343547821, + "learning_rate": 4.405543398978848e-06, + "loss": 0.0143, + "step": 12260 + }, + { + "epoch": 0.7875418201073483, + "grad_norm": 1.2235369682312012, + "learning_rate": 4.392281678933758e-06, + "loss": 0.0373, + "step": 12270 + }, + { + "epoch": 0.7881836634815189, + "grad_norm": 0.4883560240268707, + "learning_rate": 4.3790199588886685e-06, + "loss": 0.0307, + "step": 12280 + }, + { + "epoch": 0.7888255068556895, + "grad_norm": 3.3470137119293213, + "learning_rate": 4.365758238843578e-06, + "loss": 0.0108, + "step": 12290 + }, + { + "epoch": 0.7894673502298601, + "grad_norm": 0.5170362591743469, + "learning_rate": 4.352496518798488e-06, + "loss": 0.02, + "step": 12300 + }, + { + "epoch": 0.7901091936040308, + "grad_norm": 0.05397709831595421, + "learning_rate": 4.339234798753399e-06, + "loss": 0.0178, + "step": 12310 + }, + { + "epoch": 0.7907510369782014, + "grad_norm": 2.172746419906616, + "learning_rate": 4.325973078708309e-06, + "loss": 0.0161, + "step": 12320 + }, + { + "epoch": 0.791392880352372, + "grad_norm": 0.5354034900665283, + "learning_rate": 4.312711358663219e-06, + "loss": 0.0215, + "step": 12330 + }, + { + "epoch": 0.7920347237265426, + "grad_norm": 0.03487497940659523, + "learning_rate": 4.299449638618129e-06, + "loss": 0.018, + "step": 12340 + }, + { + "epoch": 0.7926765671007132, + "grad_norm": 0.5163410305976868, + "learning_rate": 4.286187918573039e-06, + "loss": 0.0216, + "step": 12350 + }, + { + "epoch": 0.7933184104748838, + "grad_norm": 2.64249324798584, + "learning_rate": 4.2729261985279494e-06, + "loss": 0.025, + "step": 12360 + }, + { + "epoch": 0.7939602538490544, + "grad_norm": 0.49127480387687683, + "learning_rate": 4.25966447848286e-06, + "loss": 0.0157, + "step": 12370 + }, + { + "epoch": 0.7946020972232251, + "grad_norm": 0.04757243022322655, + "learning_rate": 4.24640275843777e-06, + "loss": 0.0132, + "step": 12380 + }, + { + "epoch": 0.7952439405973957, + "grad_norm": 0.28864434361457825, + "learning_rate": 4.2331410383926796e-06, + "loss": 0.0368, + "step": 12390 + }, + { + "epoch": 0.7958857839715663, + "grad_norm": 2.3907408714294434, + "learning_rate": 4.21987931834759e-06, + "loss": 0.0155, + "step": 12400 + }, + { + "epoch": 0.796527627345737, + "grad_norm": 0.3429820239543915, + "learning_rate": 4.2066175983025e-06, + "loss": 0.0155, + "step": 12410 + }, + { + "epoch": 0.7971694707199076, + "grad_norm": 0.8417637944221497, + "learning_rate": 4.1933558782574105e-06, + "loss": 0.0234, + "step": 12420 + }, + { + "epoch": 0.7978113140940782, + "grad_norm": 2.053241014480591, + "learning_rate": 4.180094158212321e-06, + "loss": 0.0259, + "step": 12430 + }, + { + "epoch": 0.7984531574682489, + "grad_norm": 3.192744731903076, + "learning_rate": 4.16683243816723e-06, + "loss": 0.0431, + "step": 12440 + }, + { + "epoch": 0.7990950008424195, + "grad_norm": 0.042512197047472, + "learning_rate": 4.153570718122141e-06, + "loss": 0.0114, + "step": 12450 + }, + { + "epoch": 0.7997368442165901, + "grad_norm": 0.07811237871646881, + "learning_rate": 4.140308998077051e-06, + "loss": 0.0147, + "step": 12460 + }, + { + "epoch": 0.8003786875907607, + "grad_norm": 0.7863467335700989, + "learning_rate": 4.127047278031961e-06, + "loss": 0.0103, + "step": 12470 + }, + { + "epoch": 0.8010205309649313, + "grad_norm": 0.0853208601474762, + "learning_rate": 4.113785557986872e-06, + "loss": 0.0225, + "step": 12480 + }, + { + "epoch": 0.8016623743391019, + "grad_norm": 0.1661083996295929, + "learning_rate": 4.100523837941782e-06, + "loss": 0.0276, + "step": 12490 + }, + { + "epoch": 0.8023042177132725, + "grad_norm": 0.7978856563568115, + "learning_rate": 4.0872621178966915e-06, + "loss": 0.03, + "step": 12500 + }, + { + "epoch": 0.8029460610874432, + "grad_norm": 5.746982097625732, + "learning_rate": 4.074000397851602e-06, + "loss": 0.0242, + "step": 12510 + }, + { + "epoch": 0.8035879044616138, + "grad_norm": 0.921909511089325, + "learning_rate": 4.060738677806511e-06, + "loss": 0.0114, + "step": 12520 + }, + { + "epoch": 0.8042297478357844, + "grad_norm": 0.4040222465991974, + "learning_rate": 4.0474769577614224e-06, + "loss": 0.0117, + "step": 12530 + }, + { + "epoch": 0.804871591209955, + "grad_norm": 0.5128666758537292, + "learning_rate": 4.034215237716333e-06, + "loss": 0.0222, + "step": 12540 + }, + { + "epoch": 0.8055134345841256, + "grad_norm": 0.8506461977958679, + "learning_rate": 4.020953517671242e-06, + "loss": 0.0242, + "step": 12550 + }, + { + "epoch": 0.8061552779582962, + "grad_norm": 0.9422674179077148, + "learning_rate": 4.0076917976261526e-06, + "loss": 0.0264, + "step": 12560 + }, + { + "epoch": 0.8067971213324668, + "grad_norm": 0.8520615696907043, + "learning_rate": 3.994430077581062e-06, + "loss": 0.0115, + "step": 12570 + }, + { + "epoch": 0.8074389647066375, + "grad_norm": 0.06659626215696335, + "learning_rate": 3.981168357535972e-06, + "loss": 0.0207, + "step": 12580 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 0.9921087026596069, + "learning_rate": 3.9679066374908835e-06, + "loss": 0.0154, + "step": 12590 + }, + { + "epoch": 0.8087226514549787, + "grad_norm": 0.06613270938396454, + "learning_rate": 3.954644917445793e-06, + "loss": 0.0121, + "step": 12600 + }, + { + "epoch": 0.8093644948291493, + "grad_norm": 0.22420759499073029, + "learning_rate": 3.941383197400703e-06, + "loss": 0.0278, + "step": 12610 + }, + { + "epoch": 0.8100063382033199, + "grad_norm": 1.3557567596435547, + "learning_rate": 3.928121477355613e-06, + "loss": 0.0187, + "step": 12620 + }, + { + "epoch": 0.8106481815774905, + "grad_norm": 1.4381176233291626, + "learning_rate": 3.914859757310523e-06, + "loss": 0.022, + "step": 12630 + }, + { + "epoch": 0.8112900249516611, + "grad_norm": 0.49772679805755615, + "learning_rate": 3.9015980372654335e-06, + "loss": 0.0105, + "step": 12640 + }, + { + "epoch": 0.8119318683258318, + "grad_norm": 1.049343466758728, + "learning_rate": 3.888336317220344e-06, + "loss": 0.0229, + "step": 12650 + }, + { + "epoch": 0.8125737117000024, + "grad_norm": 3.795696496963501, + "learning_rate": 3.875074597175254e-06, + "loss": 0.0209, + "step": 12660 + }, + { + "epoch": 0.813215555074173, + "grad_norm": 0.19811248779296875, + "learning_rate": 3.861812877130164e-06, + "loss": 0.0096, + "step": 12670 + }, + { + "epoch": 0.8138573984483436, + "grad_norm": 0.3538595736026764, + "learning_rate": 3.848551157085074e-06, + "loss": 0.0218, + "step": 12680 + }, + { + "epoch": 0.8144992418225142, + "grad_norm": 0.6877108812332153, + "learning_rate": 3.835289437039984e-06, + "loss": 0.0391, + "step": 12690 + }, + { + "epoch": 0.8151410851966849, + "grad_norm": 0.044599391520023346, + "learning_rate": 3.8220277169948946e-06, + "loss": 0.0245, + "step": 12700 + }, + { + "epoch": 0.8157829285708555, + "grad_norm": 0.8634452819824219, + "learning_rate": 3.808765996949805e-06, + "loss": 0.0241, + "step": 12710 + }, + { + "epoch": 0.8164247719450262, + "grad_norm": 0.04034089669585228, + "learning_rate": 3.7955042769047152e-06, + "loss": 0.0278, + "step": 12720 + }, + { + "epoch": 0.8170666153191968, + "grad_norm": 0.028350872918963432, + "learning_rate": 3.782242556859625e-06, + "loss": 0.0108, + "step": 12730 + }, + { + "epoch": 0.8177084586933674, + "grad_norm": 0.8466787934303284, + "learning_rate": 3.7689808368145354e-06, + "loss": 0.0154, + "step": 12740 + }, + { + "epoch": 0.818350302067538, + "grad_norm": 0.27883994579315186, + "learning_rate": 3.755719116769445e-06, + "loss": 0.016, + "step": 12750 + }, + { + "epoch": 0.8189921454417086, + "grad_norm": 2.9189789295196533, + "learning_rate": 3.7424573967243557e-06, + "loss": 0.0158, + "step": 12760 + }, + { + "epoch": 0.8196339888158792, + "grad_norm": 0.6996471881866455, + "learning_rate": 3.729195676679266e-06, + "loss": 0.0141, + "step": 12770 + }, + { + "epoch": 0.8202758321900498, + "grad_norm": 0.2102804332971573, + "learning_rate": 3.7159339566341755e-06, + "loss": 0.0135, + "step": 12780 + }, + { + "epoch": 0.8209176755642205, + "grad_norm": 0.013778652995824814, + "learning_rate": 3.7026722365890862e-06, + "loss": 0.014, + "step": 12790 + }, + { + "epoch": 0.8215595189383911, + "grad_norm": 0.8039547204971313, + "learning_rate": 3.6894105165439957e-06, + "loss": 0.0134, + "step": 12800 + }, + { + "epoch": 0.8222013623125617, + "grad_norm": 3.9042227268218994, + "learning_rate": 3.676148796498906e-06, + "loss": 0.0202, + "step": 12810 + }, + { + "epoch": 0.8228432056867323, + "grad_norm": 0.5314901471138, + "learning_rate": 3.6628870764538168e-06, + "loss": 0.0164, + "step": 12820 + }, + { + "epoch": 0.8234850490609029, + "grad_norm": 0.8493812680244446, + "learning_rate": 3.6496253564087263e-06, + "loss": 0.022, + "step": 12830 + }, + { + "epoch": 0.8241268924350735, + "grad_norm": 1.8056285381317139, + "learning_rate": 3.6363636363636366e-06, + "loss": 0.0339, + "step": 12840 + }, + { + "epoch": 0.8247687358092441, + "grad_norm": 0.9738245606422424, + "learning_rate": 3.6231019163185465e-06, + "loss": 0.0314, + "step": 12850 + }, + { + "epoch": 0.8254105791834148, + "grad_norm": 6.427189350128174, + "learning_rate": 3.609840196273457e-06, + "loss": 0.0307, + "step": 12860 + }, + { + "epoch": 0.8260524225575854, + "grad_norm": 0.38021722435951233, + "learning_rate": 3.596578476228367e-06, + "loss": 0.0084, + "step": 12870 + }, + { + "epoch": 0.826694265931756, + "grad_norm": 0.674687922000885, + "learning_rate": 3.583316756183277e-06, + "loss": 0.021, + "step": 12880 + }, + { + "epoch": 0.8273361093059266, + "grad_norm": 2.1990878582000732, + "learning_rate": 3.5700550361381874e-06, + "loss": 0.0144, + "step": 12890 + }, + { + "epoch": 0.8279779526800972, + "grad_norm": 0.09193415939807892, + "learning_rate": 3.5567933160930973e-06, + "loss": 0.0214, + "step": 12900 + }, + { + "epoch": 0.8286197960542678, + "grad_norm": 0.4719434082508087, + "learning_rate": 3.5435315960480076e-06, + "loss": 0.023, + "step": 12910 + }, + { + "epoch": 0.8292616394284384, + "grad_norm": 0.2065659463405609, + "learning_rate": 3.530269876002918e-06, + "loss": 0.0231, + "step": 12920 + }, + { + "epoch": 0.8299034828026091, + "grad_norm": 0.38893452286720276, + "learning_rate": 3.517008155957828e-06, + "loss": 0.0246, + "step": 12930 + }, + { + "epoch": 0.8305453261767797, + "grad_norm": 1.6100575923919678, + "learning_rate": 3.503746435912738e-06, + "loss": 0.0173, + "step": 12940 + }, + { + "epoch": 0.8311871695509503, + "grad_norm": 0.8549080491065979, + "learning_rate": 3.4904847158676485e-06, + "loss": 0.0355, + "step": 12950 + }, + { + "epoch": 0.8318290129251209, + "grad_norm": 1.1523066759109497, + "learning_rate": 3.4772229958225584e-06, + "loss": 0.026, + "step": 12960 + }, + { + "epoch": 0.8324708562992915, + "grad_norm": 0.8753595352172852, + "learning_rate": 3.4639612757774687e-06, + "loss": 0.0283, + "step": 12970 + }, + { + "epoch": 0.8331126996734621, + "grad_norm": 0.05473419278860092, + "learning_rate": 3.4506995557323786e-06, + "loss": 0.0117, + "step": 12980 + }, + { + "epoch": 0.8337545430476329, + "grad_norm": 0.1066131442785263, + "learning_rate": 3.437437835687289e-06, + "loss": 0.0189, + "step": 12990 + }, + { + "epoch": 0.8343963864218035, + "grad_norm": 1.1916576623916626, + "learning_rate": 3.4241761156421992e-06, + "loss": 0.0299, + "step": 13000 + }, + { + "epoch": 0.8350382297959741, + "grad_norm": 0.3121644854545593, + "learning_rate": 3.410914395597109e-06, + "loss": 0.0301, + "step": 13010 + }, + { + "epoch": 0.8356800731701447, + "grad_norm": 0.024094169959425926, + "learning_rate": 3.3976526755520195e-06, + "loss": 0.0063, + "step": 13020 + }, + { + "epoch": 0.8363219165443153, + "grad_norm": 0.6124148964881897, + "learning_rate": 3.3843909555069294e-06, + "loss": 0.0212, + "step": 13030 + }, + { + "epoch": 0.8369637599184859, + "grad_norm": 4.273738861083984, + "learning_rate": 3.3711292354618397e-06, + "loss": 0.0227, + "step": 13040 + }, + { + "epoch": 0.8376056032926565, + "grad_norm": 0.7729155421257019, + "learning_rate": 3.35786751541675e-06, + "loss": 0.0144, + "step": 13050 + }, + { + "epoch": 0.8382474466668272, + "grad_norm": 0.6459729075431824, + "learning_rate": 3.34460579537166e-06, + "loss": 0.0243, + "step": 13060 + }, + { + "epoch": 0.8388892900409978, + "grad_norm": 0.844441831111908, + "learning_rate": 3.3313440753265703e-06, + "loss": 0.0239, + "step": 13070 + }, + { + "epoch": 0.8395311334151684, + "grad_norm": 0.9636744260787964, + "learning_rate": 3.31808235528148e-06, + "loss": 0.0183, + "step": 13080 + }, + { + "epoch": 0.840172976789339, + "grad_norm": 0.8931861519813538, + "learning_rate": 3.3048206352363905e-06, + "loss": 0.0296, + "step": 13090 + }, + { + "epoch": 0.8408148201635096, + "grad_norm": 0.22203336656093597, + "learning_rate": 3.291558915191301e-06, + "loss": 0.0167, + "step": 13100 + }, + { + "epoch": 0.8414566635376802, + "grad_norm": 1.675729751586914, + "learning_rate": 3.2782971951462107e-06, + "loss": 0.0187, + "step": 13110 + }, + { + "epoch": 0.8420985069118508, + "grad_norm": 0.29589271545410156, + "learning_rate": 3.265035475101121e-06, + "loss": 0.0273, + "step": 13120 + }, + { + "epoch": 0.8427403502860215, + "grad_norm": 2.055467367172241, + "learning_rate": 3.251773755056031e-06, + "loss": 0.0188, + "step": 13130 + }, + { + "epoch": 0.8433821936601921, + "grad_norm": 3.291701078414917, + "learning_rate": 3.2385120350109413e-06, + "loss": 0.0133, + "step": 13140 + }, + { + "epoch": 0.8440240370343627, + "grad_norm": 0.6344001889228821, + "learning_rate": 3.2252503149658516e-06, + "loss": 0.0246, + "step": 13150 + }, + { + "epoch": 0.8446658804085333, + "grad_norm": 0.5008775591850281, + "learning_rate": 3.2119885949207615e-06, + "loss": 0.0297, + "step": 13160 + }, + { + "epoch": 0.8453077237827039, + "grad_norm": 0.3949437439441681, + "learning_rate": 3.198726874875672e-06, + "loss": 0.0275, + "step": 13170 + }, + { + "epoch": 0.8459495671568745, + "grad_norm": 2.543548345565796, + "learning_rate": 3.185465154830582e-06, + "loss": 0.0219, + "step": 13180 + }, + { + "epoch": 0.8465914105310451, + "grad_norm": 0.8906174302101135, + "learning_rate": 3.172203434785492e-06, + "loss": 0.0162, + "step": 13190 + }, + { + "epoch": 0.8472332539052158, + "grad_norm": 7.785815238952637, + "learning_rate": 3.1589417147404024e-06, + "loss": 0.0298, + "step": 13200 + }, + { + "epoch": 0.8478750972793864, + "grad_norm": 5.277609825134277, + "learning_rate": 3.145679994695312e-06, + "loss": 0.0329, + "step": 13210 + }, + { + "epoch": 0.848516940653557, + "grad_norm": 2.5439512729644775, + "learning_rate": 3.1324182746502226e-06, + "loss": 0.0244, + "step": 13220 + }, + { + "epoch": 0.8491587840277276, + "grad_norm": 1.0475541353225708, + "learning_rate": 3.119156554605133e-06, + "loss": 0.0211, + "step": 13230 + }, + { + "epoch": 0.8498006274018982, + "grad_norm": 0.20279256999492645, + "learning_rate": 3.1058948345600424e-06, + "loss": 0.0187, + "step": 13240 + }, + { + "epoch": 0.8504424707760688, + "grad_norm": 1.6604892015457153, + "learning_rate": 3.092633114514953e-06, + "loss": 0.0213, + "step": 13250 + }, + { + "epoch": 0.8510843141502394, + "grad_norm": 0.35537654161453247, + "learning_rate": 3.0793713944698626e-06, + "loss": 0.0362, + "step": 13260 + }, + { + "epoch": 0.85172615752441, + "grad_norm": 1.9353057146072388, + "learning_rate": 3.066109674424773e-06, + "loss": 0.0195, + "step": 13270 + }, + { + "epoch": 0.8523680008985807, + "grad_norm": 0.24915455281734467, + "learning_rate": 3.0528479543796837e-06, + "loss": 0.0239, + "step": 13280 + }, + { + "epoch": 0.8530098442727514, + "grad_norm": 0.3387570381164551, + "learning_rate": 3.039586234334593e-06, + "loss": 0.017, + "step": 13290 + }, + { + "epoch": 0.853651687646922, + "grad_norm": 0.6907055377960205, + "learning_rate": 3.0263245142895035e-06, + "loss": 0.0212, + "step": 13300 + }, + { + "epoch": 0.8542935310210926, + "grad_norm": 0.5599414110183716, + "learning_rate": 3.0130627942444134e-06, + "loss": 0.0354, + "step": 13310 + }, + { + "epoch": 0.8549353743952632, + "grad_norm": 0.10817236453294754, + "learning_rate": 2.9998010741993237e-06, + "loss": 0.0124, + "step": 13320 + }, + { + "epoch": 0.8555772177694339, + "grad_norm": 0.4376680254936218, + "learning_rate": 2.986539354154234e-06, + "loss": 0.0218, + "step": 13330 + }, + { + "epoch": 0.8562190611436045, + "grad_norm": 0.790965735912323, + "learning_rate": 2.973277634109144e-06, + "loss": 0.0219, + "step": 13340 + }, + { + "epoch": 0.8568609045177751, + "grad_norm": 0.57417893409729, + "learning_rate": 2.9600159140640543e-06, + "loss": 0.0179, + "step": 13350 + }, + { + "epoch": 0.8575027478919457, + "grad_norm": 0.20554496347904205, + "learning_rate": 2.946754194018964e-06, + "loss": 0.0215, + "step": 13360 + }, + { + "epoch": 0.8581445912661163, + "grad_norm": 0.1840430051088333, + "learning_rate": 2.9334924739738745e-06, + "loss": 0.0194, + "step": 13370 + }, + { + "epoch": 0.8587864346402869, + "grad_norm": 0.9449292421340942, + "learning_rate": 2.920230753928785e-06, + "loss": 0.0177, + "step": 13380 + }, + { + "epoch": 0.8594282780144575, + "grad_norm": 0.7500403523445129, + "learning_rate": 2.9069690338836947e-06, + "loss": 0.044, + "step": 13390 + }, + { + "epoch": 0.8600701213886282, + "grad_norm": 4.72554349899292, + "learning_rate": 2.893707313838605e-06, + "loss": 0.0221, + "step": 13400 + }, + { + "epoch": 0.8607119647627988, + "grad_norm": 3.8151938915252686, + "learning_rate": 2.8804455937935154e-06, + "loss": 0.0208, + "step": 13410 + }, + { + "epoch": 0.8613538081369694, + "grad_norm": 2.1088836193084717, + "learning_rate": 2.8671838737484253e-06, + "loss": 0.0342, + "step": 13420 + }, + { + "epoch": 0.86199565151114, + "grad_norm": 2.671661138534546, + "learning_rate": 2.8539221537033356e-06, + "loss": 0.0301, + "step": 13430 + }, + { + "epoch": 0.8626374948853106, + "grad_norm": 0.2565511167049408, + "learning_rate": 2.8406604336582455e-06, + "loss": 0.019, + "step": 13440 + }, + { + "epoch": 0.8632793382594812, + "grad_norm": 0.8613461256027222, + "learning_rate": 2.827398713613156e-06, + "loss": 0.0124, + "step": 13450 + }, + { + "epoch": 0.8639211816336518, + "grad_norm": 0.024694399908185005, + "learning_rate": 2.814136993568066e-06, + "loss": 0.0274, + "step": 13460 + }, + { + "epoch": 0.8645630250078224, + "grad_norm": 0.3943841755390167, + "learning_rate": 2.800875273522976e-06, + "loss": 0.0173, + "step": 13470 + }, + { + "epoch": 0.8652048683819931, + "grad_norm": 0.1581508219242096, + "learning_rate": 2.7876135534778864e-06, + "loss": 0.0091, + "step": 13480 + }, + { + "epoch": 0.8658467117561637, + "grad_norm": 0.8672017455101013, + "learning_rate": 2.7743518334327963e-06, + "loss": 0.0178, + "step": 13490 + }, + { + "epoch": 0.8664885551303343, + "grad_norm": 0.06939632445573807, + "learning_rate": 2.7610901133877066e-06, + "loss": 0.0228, + "step": 13500 + }, + { + "epoch": 0.8671303985045049, + "grad_norm": 0.6396413445472717, + "learning_rate": 2.747828393342617e-06, + "loss": 0.0237, + "step": 13510 + }, + { + "epoch": 0.8677722418786755, + "grad_norm": 0.2894430458545685, + "learning_rate": 2.734566673297527e-06, + "loss": 0.012, + "step": 13520 + }, + { + "epoch": 0.8684140852528461, + "grad_norm": 0.6599921584129333, + "learning_rate": 2.721304953252437e-06, + "loss": 0.0208, + "step": 13530 + }, + { + "epoch": 0.8690559286270167, + "grad_norm": 0.015496225096285343, + "learning_rate": 2.708043233207347e-06, + "loss": 0.0197, + "step": 13540 + }, + { + "epoch": 0.8696977720011874, + "grad_norm": 0.09208979457616806, + "learning_rate": 2.6947815131622574e-06, + "loss": 0.021, + "step": 13550 + }, + { + "epoch": 0.870339615375358, + "grad_norm": 0.05079632252454758, + "learning_rate": 2.6815197931171677e-06, + "loss": 0.0154, + "step": 13560 + }, + { + "epoch": 0.8709814587495286, + "grad_norm": 0.5705692172050476, + "learning_rate": 2.6682580730720776e-06, + "loss": 0.0187, + "step": 13570 + }, + { + "epoch": 0.8716233021236993, + "grad_norm": 1.234823226928711, + "learning_rate": 2.654996353026988e-06, + "loss": 0.0292, + "step": 13580 + }, + { + "epoch": 0.8722651454978699, + "grad_norm": 0.5673630237579346, + "learning_rate": 2.641734632981898e-06, + "loss": 0.0275, + "step": 13590 + }, + { + "epoch": 0.8729069888720405, + "grad_norm": 0.007527688052505255, + "learning_rate": 2.628472912936808e-06, + "loss": 0.0218, + "step": 13600 + }, + { + "epoch": 0.8735488322462112, + "grad_norm": 1.007513165473938, + "learning_rate": 2.6152111928917185e-06, + "loss": 0.0129, + "step": 13610 + }, + { + "epoch": 0.8741906756203818, + "grad_norm": 1.1092039346694946, + "learning_rate": 2.6019494728466284e-06, + "loss": 0.0167, + "step": 13620 + }, + { + "epoch": 0.8748325189945524, + "grad_norm": 1.0379366874694824, + "learning_rate": 2.5886877528015387e-06, + "loss": 0.0184, + "step": 13630 + }, + { + "epoch": 0.875474362368723, + "grad_norm": 0.5166342854499817, + "learning_rate": 2.575426032756449e-06, + "loss": 0.0265, + "step": 13640 + }, + { + "epoch": 0.8761162057428936, + "grad_norm": 1.019518256187439, + "learning_rate": 2.562164312711359e-06, + "loss": 0.0188, + "step": 13650 + }, + { + "epoch": 0.8767580491170642, + "grad_norm": 0.8891957402229309, + "learning_rate": 2.5489025926662693e-06, + "loss": 0.0195, + "step": 13660 + }, + { + "epoch": 0.8773998924912348, + "grad_norm": 0.3497367799282074, + "learning_rate": 2.535640872621179e-06, + "loss": 0.0279, + "step": 13670 + }, + { + "epoch": 0.8780417358654055, + "grad_norm": 0.20192396640777588, + "learning_rate": 2.5223791525760895e-06, + "loss": 0.0227, + "step": 13680 + }, + { + "epoch": 0.8786835792395761, + "grad_norm": 0.2110779732465744, + "learning_rate": 2.509117432531e-06, + "loss": 0.0164, + "step": 13690 + }, + { + "epoch": 0.8793254226137467, + "grad_norm": 0.18230389058589935, + "learning_rate": 2.4958557124859093e-06, + "loss": 0.02, + "step": 13700 + }, + { + "epoch": 0.8799672659879173, + "grad_norm": 0.035150956362485886, + "learning_rate": 2.48259399244082e-06, + "loss": 0.022, + "step": 13710 + }, + { + "epoch": 0.8806091093620879, + "grad_norm": 2.311920166015625, + "learning_rate": 2.46933227239573e-06, + "loss": 0.0207, + "step": 13720 + }, + { + "epoch": 0.8812509527362585, + "grad_norm": 0.016587447375059128, + "learning_rate": 2.45607055235064e-06, + "loss": 0.0206, + "step": 13730 + }, + { + "epoch": 0.8818927961104291, + "grad_norm": 0.6804785132408142, + "learning_rate": 2.44280883230555e-06, + "loss": 0.0222, + "step": 13740 + }, + { + "epoch": 0.8825346394845998, + "grad_norm": 0.03751286119222641, + "learning_rate": 2.42954711226046e-06, + "loss": 0.0077, + "step": 13750 + }, + { + "epoch": 0.8831764828587704, + "grad_norm": 0.02884281426668167, + "learning_rate": 2.4162853922153704e-06, + "loss": 0.0218, + "step": 13760 + }, + { + "epoch": 0.883818326232941, + "grad_norm": 0.7173253893852234, + "learning_rate": 2.4030236721702807e-06, + "loss": 0.0301, + "step": 13770 + }, + { + "epoch": 0.8844601696071116, + "grad_norm": 2.145676851272583, + "learning_rate": 2.3897619521251906e-06, + "loss": 0.0274, + "step": 13780 + }, + { + "epoch": 0.8851020129812822, + "grad_norm": 0.00921548716723919, + "learning_rate": 2.376500232080101e-06, + "loss": 0.0207, + "step": 13790 + }, + { + "epoch": 0.8857438563554528, + "grad_norm": 0.2222190648317337, + "learning_rate": 2.3632385120350113e-06, + "loss": 0.0294, + "step": 13800 + }, + { + "epoch": 0.8863856997296234, + "grad_norm": 0.701604962348938, + "learning_rate": 2.349976791989921e-06, + "loss": 0.0127, + "step": 13810 + }, + { + "epoch": 0.8870275431037941, + "grad_norm": 1.019660472869873, + "learning_rate": 2.3367150719448315e-06, + "loss": 0.0241, + "step": 13820 + }, + { + "epoch": 0.8876693864779647, + "grad_norm": 4.041540145874023, + "learning_rate": 2.3234533518997414e-06, + "loss": 0.0181, + "step": 13830 + }, + { + "epoch": 0.8883112298521353, + "grad_norm": 0.7163257598876953, + "learning_rate": 2.3101916318546517e-06, + "loss": 0.0111, + "step": 13840 + }, + { + "epoch": 0.8889530732263059, + "grad_norm": 0.4597417712211609, + "learning_rate": 2.296929911809562e-06, + "loss": 0.0328, + "step": 13850 + }, + { + "epoch": 0.8895949166004765, + "grad_norm": 0.07502664625644684, + "learning_rate": 2.283668191764472e-06, + "loss": 0.0309, + "step": 13860 + }, + { + "epoch": 0.8902367599746472, + "grad_norm": 0.32292598485946655, + "learning_rate": 2.2704064717193823e-06, + "loss": 0.014, + "step": 13870 + }, + { + "epoch": 0.8908786033488179, + "grad_norm": 0.6518355011940002, + "learning_rate": 2.257144751674292e-06, + "loss": 0.0152, + "step": 13880 + }, + { + "epoch": 0.8915204467229885, + "grad_norm": 0.5202192068099976, + "learning_rate": 2.2438830316292025e-06, + "loss": 0.0183, + "step": 13890 + }, + { + "epoch": 0.8921622900971591, + "grad_norm": 0.04122181981801987, + "learning_rate": 2.230621311584113e-06, + "loss": 0.0303, + "step": 13900 + }, + { + "epoch": 0.8928041334713297, + "grad_norm": 0.09100360423326492, + "learning_rate": 2.2173595915390228e-06, + "loss": 0.015, + "step": 13910 + }, + { + "epoch": 0.8934459768455003, + "grad_norm": 0.41373616456985474, + "learning_rate": 2.204097871493933e-06, + "loss": 0.022, + "step": 13920 + }, + { + "epoch": 0.8940878202196709, + "grad_norm": 0.007583377417176962, + "learning_rate": 2.190836151448843e-06, + "loss": 0.013, + "step": 13930 + }, + { + "epoch": 0.8947296635938415, + "grad_norm": 0.731808602809906, + "learning_rate": 2.1775744314037533e-06, + "loss": 0.0154, + "step": 13940 + }, + { + "epoch": 0.8953715069680122, + "grad_norm": 2.2925314903259277, + "learning_rate": 2.1643127113586636e-06, + "loss": 0.022, + "step": 13950 + }, + { + "epoch": 0.8960133503421828, + "grad_norm": 0.034649644047021866, + "learning_rate": 2.1510509913135735e-06, + "loss": 0.0147, + "step": 13960 + }, + { + "epoch": 0.8966551937163534, + "grad_norm": 0.47291600704193115, + "learning_rate": 2.1377892712684834e-06, + "loss": 0.01, + "step": 13970 + }, + { + "epoch": 0.897297037090524, + "grad_norm": 7.475317001342773, + "learning_rate": 2.1245275512233938e-06, + "loss": 0.0187, + "step": 13980 + }, + { + "epoch": 0.8979388804646946, + "grad_norm": 0.4022504389286041, + "learning_rate": 2.111265831178304e-06, + "loss": 0.0194, + "step": 13990 + }, + { + "epoch": 0.8985807238388652, + "grad_norm": 0.17653189599514008, + "learning_rate": 2.098004111133214e-06, + "loss": 0.029, + "step": 14000 + }, + { + "epoch": 0.8992225672130358, + "grad_norm": 2.2175135612487793, + "learning_rate": 2.0847423910881243e-06, + "loss": 0.0499, + "step": 14010 + }, + { + "epoch": 0.8998644105872065, + "grad_norm": 0.11678501218557358, + "learning_rate": 2.0714806710430342e-06, + "loss": 0.015, + "step": 14020 + }, + { + "epoch": 0.9005062539613771, + "grad_norm": 2.840829849243164, + "learning_rate": 2.0582189509979445e-06, + "loss": 0.032, + "step": 14030 + }, + { + "epoch": 0.9011480973355477, + "grad_norm": 3.5912017822265625, + "learning_rate": 2.044957230952855e-06, + "loss": 0.0145, + "step": 14040 + }, + { + "epoch": 0.9017899407097183, + "grad_norm": 0.4161267876625061, + "learning_rate": 2.0316955109077648e-06, + "loss": 0.0111, + "step": 14050 + }, + { + "epoch": 0.9024317840838889, + "grad_norm": 2.903442621231079, + "learning_rate": 2.018433790862675e-06, + "loss": 0.0216, + "step": 14060 + }, + { + "epoch": 0.9030736274580595, + "grad_norm": 2.279383420944214, + "learning_rate": 2.005172070817585e-06, + "loss": 0.0265, + "step": 14070 + }, + { + "epoch": 0.9037154708322301, + "grad_norm": 6.031994342803955, + "learning_rate": 1.9919103507724953e-06, + "loss": 0.0236, + "step": 14080 + }, + { + "epoch": 0.9043573142064008, + "grad_norm": 1.7201488018035889, + "learning_rate": 1.9786486307274056e-06, + "loss": 0.0318, + "step": 14090 + }, + { + "epoch": 0.9049991575805714, + "grad_norm": 0.8770641684532166, + "learning_rate": 1.9653869106823155e-06, + "loss": 0.0129, + "step": 14100 + }, + { + "epoch": 0.905641000954742, + "grad_norm": 0.23001974821090698, + "learning_rate": 1.952125190637226e-06, + "loss": 0.0192, + "step": 14110 + }, + { + "epoch": 0.9062828443289126, + "grad_norm": 2.5015745162963867, + "learning_rate": 1.938863470592136e-06, + "loss": 0.0173, + "step": 14120 + }, + { + "epoch": 0.9069246877030832, + "grad_norm": 0.13634377717971802, + "learning_rate": 1.925601750547046e-06, + "loss": 0.0292, + "step": 14130 + }, + { + "epoch": 0.9075665310772538, + "grad_norm": 1.0855636596679688, + "learning_rate": 1.9123400305019564e-06, + "loss": 0.0187, + "step": 14140 + }, + { + "epoch": 0.9082083744514244, + "grad_norm": 0.20524372160434723, + "learning_rate": 1.8990783104568663e-06, + "loss": 0.0153, + "step": 14150 + }, + { + "epoch": 0.9088502178255952, + "grad_norm": 0.9702785611152649, + "learning_rate": 1.8858165904117764e-06, + "loss": 0.0243, + "step": 14160 + }, + { + "epoch": 0.9094920611997658, + "grad_norm": 0.0294529739767313, + "learning_rate": 1.8725548703666868e-06, + "loss": 0.0296, + "step": 14170 + }, + { + "epoch": 0.9101339045739364, + "grad_norm": 1.1509523391723633, + "learning_rate": 1.8592931503215969e-06, + "loss": 0.0264, + "step": 14180 + }, + { + "epoch": 0.910775747948107, + "grad_norm": 0.44765394926071167, + "learning_rate": 1.846031430276507e-06, + "loss": 0.0185, + "step": 14190 + }, + { + "epoch": 0.9114175913222776, + "grad_norm": 0.05895282328128815, + "learning_rate": 1.832769710231417e-06, + "loss": 0.0239, + "step": 14200 + }, + { + "epoch": 0.9120594346964482, + "grad_norm": 0.01649278774857521, + "learning_rate": 1.8195079901863272e-06, + "loss": 0.0216, + "step": 14210 + }, + { + "epoch": 0.9127012780706188, + "grad_norm": 0.04506438225507736, + "learning_rate": 1.8062462701412375e-06, + "loss": 0.0172, + "step": 14220 + }, + { + "epoch": 0.9133431214447895, + "grad_norm": 0.364442378282547, + "learning_rate": 1.7929845500961477e-06, + "loss": 0.0346, + "step": 14230 + }, + { + "epoch": 0.9139849648189601, + "grad_norm": 0.5869020223617554, + "learning_rate": 1.7797228300510578e-06, + "loss": 0.0207, + "step": 14240 + }, + { + "epoch": 0.9146268081931307, + "grad_norm": 0.6192695498466492, + "learning_rate": 1.7664611100059679e-06, + "loss": 0.012, + "step": 14250 + }, + { + "epoch": 0.9152686515673013, + "grad_norm": 0.13403886556625366, + "learning_rate": 1.7531993899608782e-06, + "loss": 0.0269, + "step": 14260 + }, + { + "epoch": 0.9159104949414719, + "grad_norm": 7.840417385101318, + "learning_rate": 1.7399376699157883e-06, + "loss": 0.0236, + "step": 14270 + }, + { + "epoch": 0.9165523383156425, + "grad_norm": 0.1548127382993698, + "learning_rate": 1.7266759498706984e-06, + "loss": 0.0379, + "step": 14280 + }, + { + "epoch": 0.9171941816898131, + "grad_norm": 0.05730681121349335, + "learning_rate": 1.7134142298256085e-06, + "loss": 0.0151, + "step": 14290 + }, + { + "epoch": 0.9178360250639838, + "grad_norm": 2.952178955078125, + "learning_rate": 1.7001525097805185e-06, + "loss": 0.0124, + "step": 14300 + }, + { + "epoch": 0.9184778684381544, + "grad_norm": 0.4452185332775116, + "learning_rate": 1.686890789735429e-06, + "loss": 0.0208, + "step": 14310 + }, + { + "epoch": 0.919119711812325, + "grad_norm": 0.05769308656454086, + "learning_rate": 1.673629069690339e-06, + "loss": 0.0225, + "step": 14320 + }, + { + "epoch": 0.9197615551864956, + "grad_norm": 0.30611440539360046, + "learning_rate": 1.660367349645249e-06, + "loss": 0.0153, + "step": 14330 + }, + { + "epoch": 0.9204033985606662, + "grad_norm": 4.835531234741211, + "learning_rate": 1.6471056296001591e-06, + "loss": 0.0267, + "step": 14340 + }, + { + "epoch": 0.9210452419348368, + "grad_norm": 6.112522602081299, + "learning_rate": 1.6338439095550697e-06, + "loss": 0.0235, + "step": 14350 + }, + { + "epoch": 0.9216870853090074, + "grad_norm": 0.17406339943408966, + "learning_rate": 1.6205821895099796e-06, + "loss": 0.0303, + "step": 14360 + }, + { + "epoch": 0.9223289286831781, + "grad_norm": 0.049876339733600616, + "learning_rate": 1.6073204694648897e-06, + "loss": 0.0132, + "step": 14370 + }, + { + "epoch": 0.9229707720573487, + "grad_norm": 0.455746591091156, + "learning_rate": 1.5940587494197998e-06, + "loss": 0.0104, + "step": 14380 + }, + { + "epoch": 0.9236126154315193, + "grad_norm": 0.1712111383676529, + "learning_rate": 1.58079702937471e-06, + "loss": 0.0189, + "step": 14390 + }, + { + "epoch": 0.9242544588056899, + "grad_norm": 1.3781015872955322, + "learning_rate": 1.5675353093296202e-06, + "loss": 0.0215, + "step": 14400 + }, + { + "epoch": 0.9248963021798605, + "grad_norm": 0.18812592327594757, + "learning_rate": 1.5542735892845303e-06, + "loss": 0.0259, + "step": 14410 + }, + { + "epoch": 0.9255381455540311, + "grad_norm": 0.16782377660274506, + "learning_rate": 1.5410118692394404e-06, + "loss": 0.0327, + "step": 14420 + }, + { + "epoch": 0.9261799889282017, + "grad_norm": 0.07186288386583328, + "learning_rate": 1.5277501491943506e-06, + "loss": 0.0182, + "step": 14430 + }, + { + "epoch": 0.9268218323023724, + "grad_norm": 0.7693184018135071, + "learning_rate": 1.5144884291492609e-06, + "loss": 0.0235, + "step": 14440 + }, + { + "epoch": 0.9274636756765431, + "grad_norm": 0.5924553275108337, + "learning_rate": 1.501226709104171e-06, + "loss": 0.0281, + "step": 14450 + }, + { + "epoch": 0.9281055190507137, + "grad_norm": 0.031777169555425644, + "learning_rate": 1.4879649890590811e-06, + "loss": 0.0155, + "step": 14460 + }, + { + "epoch": 0.9287473624248843, + "grad_norm": 0.011459583416581154, + "learning_rate": 1.4747032690139912e-06, + "loss": 0.0345, + "step": 14470 + }, + { + "epoch": 0.9293892057990549, + "grad_norm": 0.35816437005996704, + "learning_rate": 1.4614415489689013e-06, + "loss": 0.0273, + "step": 14480 + }, + { + "epoch": 0.9300310491732255, + "grad_norm": 0.29088306427001953, + "learning_rate": 1.4481798289238117e-06, + "loss": 0.0264, + "step": 14490 + }, + { + "epoch": 0.9306728925473962, + "grad_norm": 0.4196803569793701, + "learning_rate": 1.4349181088787218e-06, + "loss": 0.0053, + "step": 14500 + }, + { + "epoch": 0.9313147359215668, + "grad_norm": 0.1188807487487793, + "learning_rate": 1.4216563888336319e-06, + "loss": 0.0238, + "step": 14510 + }, + { + "epoch": 0.9319565792957374, + "grad_norm": 1.8770768642425537, + "learning_rate": 1.408394668788542e-06, + "loss": 0.0292, + "step": 14520 + }, + { + "epoch": 0.932598422669908, + "grad_norm": 0.8623146414756775, + "learning_rate": 1.395132948743452e-06, + "loss": 0.0105, + "step": 14530 + }, + { + "epoch": 0.9332402660440786, + "grad_norm": 0.8951078057289124, + "learning_rate": 1.3818712286983624e-06, + "loss": 0.0246, + "step": 14540 + }, + { + "epoch": 0.9338821094182492, + "grad_norm": 0.2718936800956726, + "learning_rate": 1.3686095086532726e-06, + "loss": 0.0123, + "step": 14550 + }, + { + "epoch": 0.9345239527924198, + "grad_norm": 0.025857457891106606, + "learning_rate": 1.3553477886081825e-06, + "loss": 0.0129, + "step": 14560 + }, + { + "epoch": 0.9351657961665905, + "grad_norm": 0.4448324143886566, + "learning_rate": 1.3420860685630926e-06, + "loss": 0.0206, + "step": 14570 + }, + { + "epoch": 0.9358076395407611, + "grad_norm": 0.21557696163654327, + "learning_rate": 1.3288243485180031e-06, + "loss": 0.0345, + "step": 14580 + }, + { + "epoch": 0.9364494829149317, + "grad_norm": 0.2089679092168808, + "learning_rate": 1.315562628472913e-06, + "loss": 0.0187, + "step": 14590 + }, + { + "epoch": 0.9370913262891023, + "grad_norm": 2.0699353218078613, + "learning_rate": 1.3023009084278231e-06, + "loss": 0.0217, + "step": 14600 + }, + { + "epoch": 0.9377331696632729, + "grad_norm": 2.9068894386291504, + "learning_rate": 1.2890391883827332e-06, + "loss": 0.0247, + "step": 14610 + }, + { + "epoch": 0.9383750130374435, + "grad_norm": 1.3320043087005615, + "learning_rate": 1.2757774683376434e-06, + "loss": 0.0106, + "step": 14620 + }, + { + "epoch": 0.9390168564116141, + "grad_norm": 2.405679702758789, + "learning_rate": 1.2625157482925537e-06, + "loss": 0.0189, + "step": 14630 + }, + { + "epoch": 0.9396586997857848, + "grad_norm": 0.5062136650085449, + "learning_rate": 1.2492540282474638e-06, + "loss": 0.0228, + "step": 14640 + }, + { + "epoch": 0.9403005431599554, + "grad_norm": 1.138634204864502, + "learning_rate": 1.235992308202374e-06, + "loss": 0.0163, + "step": 14650 + }, + { + "epoch": 0.940942386534126, + "grad_norm": 0.49596962332725525, + "learning_rate": 1.2227305881572842e-06, + "loss": 0.0198, + "step": 14660 + }, + { + "epoch": 0.9415842299082966, + "grad_norm": 0.5795552730560303, + "learning_rate": 1.2094688681121943e-06, + "loss": 0.0145, + "step": 14670 + }, + { + "epoch": 0.9422260732824672, + "grad_norm": 1.0133116245269775, + "learning_rate": 1.1962071480671042e-06, + "loss": 0.0144, + "step": 14680 + }, + { + "epoch": 0.9428679166566378, + "grad_norm": 0.3072698712348938, + "learning_rate": 1.1829454280220146e-06, + "loss": 0.0223, + "step": 14690 + }, + { + "epoch": 0.9435097600308084, + "grad_norm": 3.633915901184082, + "learning_rate": 1.1696837079769247e-06, + "loss": 0.0249, + "step": 14700 + }, + { + "epoch": 0.9441516034049791, + "grad_norm": 1.0597184896469116, + "learning_rate": 1.1564219879318348e-06, + "loss": 0.014, + "step": 14710 + }, + { + "epoch": 0.9447934467791497, + "grad_norm": 1.0187700986862183, + "learning_rate": 1.143160267886745e-06, + "loss": 0.0198, + "step": 14720 + }, + { + "epoch": 0.9454352901533203, + "grad_norm": 0.5023682117462158, + "learning_rate": 1.1298985478416552e-06, + "loss": 0.0311, + "step": 14730 + }, + { + "epoch": 0.9460771335274909, + "grad_norm": 2.0689022541046143, + "learning_rate": 1.1166368277965654e-06, + "loss": 0.0197, + "step": 14740 + }, + { + "epoch": 0.9467189769016616, + "grad_norm": 7.048704147338867, + "learning_rate": 1.1033751077514755e-06, + "loss": 0.0235, + "step": 14750 + }, + { + "epoch": 0.9473608202758322, + "grad_norm": 0.6049743890762329, + "learning_rate": 1.0901133877063856e-06, + "loss": 0.0213, + "step": 14760 + }, + { + "epoch": 0.9480026636500029, + "grad_norm": 1.2432732582092285, + "learning_rate": 1.0768516676612957e-06, + "loss": 0.0244, + "step": 14770 + }, + { + "epoch": 0.9486445070241735, + "grad_norm": 1.8908295631408691, + "learning_rate": 1.063589947616206e-06, + "loss": 0.0304, + "step": 14780 + }, + { + "epoch": 0.9492863503983441, + "grad_norm": 0.16690313816070557, + "learning_rate": 1.050328227571116e-06, + "loss": 0.0116, + "step": 14790 + }, + { + "epoch": 0.9499281937725147, + "grad_norm": 1.6157116889953613, + "learning_rate": 1.0370665075260262e-06, + "loss": 0.0251, + "step": 14800 + }, + { + "epoch": 0.9505700371466853, + "grad_norm": 0.6109140515327454, + "learning_rate": 1.0238047874809364e-06, + "loss": 0.0253, + "step": 14810 + }, + { + "epoch": 0.9512118805208559, + "grad_norm": 1.494762659072876, + "learning_rate": 1.0105430674358465e-06, + "loss": 0.0138, + "step": 14820 + }, + { + "epoch": 0.9518537238950265, + "grad_norm": 0.47034692764282227, + "learning_rate": 9.972813473907566e-07, + "loss": 0.0162, + "step": 14830 + }, + { + "epoch": 0.9524955672691972, + "grad_norm": 0.22934949398040771, + "learning_rate": 9.840196273456667e-07, + "loss": 0.024, + "step": 14840 + }, + { + "epoch": 0.9531374106433678, + "grad_norm": 0.8337307572364807, + "learning_rate": 9.70757907300577e-07, + "loss": 0.0251, + "step": 14850 + }, + { + "epoch": 0.9537792540175384, + "grad_norm": 2.82429575920105, + "learning_rate": 9.574961872554871e-07, + "loss": 0.0198, + "step": 14860 + }, + { + "epoch": 0.954421097391709, + "grad_norm": 0.22207744419574738, + "learning_rate": 9.442344672103972e-07, + "loss": 0.0143, + "step": 14870 + }, + { + "epoch": 0.9550629407658796, + "grad_norm": 0.013904299587011337, + "learning_rate": 9.309727471653074e-07, + "loss": 0.0214, + "step": 14880 + }, + { + "epoch": 0.9557047841400502, + "grad_norm": 7.341753959655762, + "learning_rate": 9.177110271202176e-07, + "loss": 0.0161, + "step": 14890 + }, + { + "epoch": 0.9563466275142208, + "grad_norm": 0.29478883743286133, + "learning_rate": 9.044493070751277e-07, + "loss": 0.0157, + "step": 14900 + }, + { + "epoch": 0.9569884708883915, + "grad_norm": 0.01080330554395914, + "learning_rate": 8.911875870300379e-07, + "loss": 0.0217, + "step": 14910 + }, + { + "epoch": 0.9576303142625621, + "grad_norm": 0.024894973263144493, + "learning_rate": 8.77925866984948e-07, + "loss": 0.0208, + "step": 14920 + }, + { + "epoch": 0.9582721576367327, + "grad_norm": 5.743207931518555, + "learning_rate": 8.646641469398581e-07, + "loss": 0.0209, + "step": 14930 + }, + { + "epoch": 0.9589140010109033, + "grad_norm": 0.9241563677787781, + "learning_rate": 8.514024268947684e-07, + "loss": 0.0315, + "step": 14940 + }, + { + "epoch": 0.9595558443850739, + "grad_norm": 0.06609442830085754, + "learning_rate": 8.381407068496785e-07, + "loss": 0.0153, + "step": 14950 + }, + { + "epoch": 0.9601976877592445, + "grad_norm": 0.5711029767990112, + "learning_rate": 8.248789868045887e-07, + "loss": 0.0254, + "step": 14960 + }, + { + "epoch": 0.9608395311334151, + "grad_norm": 3.806605100631714, + "learning_rate": 8.116172667594988e-07, + "loss": 0.0291, + "step": 14970 + }, + { + "epoch": 0.9614813745075858, + "grad_norm": 0.03836154565215111, + "learning_rate": 7.98355546714409e-07, + "loss": 0.0177, + "step": 14980 + }, + { + "epoch": 0.9621232178817564, + "grad_norm": 1.2240588665008545, + "learning_rate": 7.85093826669319e-07, + "loss": 0.0102, + "step": 14990 + }, + { + "epoch": 0.962765061255927, + "grad_norm": 1.0644761323928833, + "learning_rate": 7.718321066242291e-07, + "loss": 0.0426, + "step": 15000 + }, + { + "epoch": 0.9634069046300976, + "grad_norm": 0.48844975233078003, + "learning_rate": 7.585703865791394e-07, + "loss": 0.0362, + "step": 15010 + }, + { + "epoch": 0.9640487480042682, + "grad_norm": 1.1872748136520386, + "learning_rate": 7.453086665340495e-07, + "loss": 0.0327, + "step": 15020 + }, + { + "epoch": 0.9646905913784388, + "grad_norm": 1.2432113885879517, + "learning_rate": 7.320469464889597e-07, + "loss": 0.0236, + "step": 15030 + }, + { + "epoch": 0.9653324347526095, + "grad_norm": 0.12339625507593155, + "learning_rate": 7.187852264438698e-07, + "loss": 0.0213, + "step": 15040 + }, + { + "epoch": 0.9659742781267802, + "grad_norm": 0.8140659332275391, + "learning_rate": 7.0552350639878e-07, + "loss": 0.0208, + "step": 15050 + }, + { + "epoch": 0.9666161215009508, + "grad_norm": 0.10417089611291885, + "learning_rate": 6.922617863536901e-07, + "loss": 0.0185, + "step": 15060 + }, + { + "epoch": 0.9672579648751214, + "grad_norm": 1.1666573286056519, + "learning_rate": 6.790000663086003e-07, + "loss": 0.016, + "step": 15070 + }, + { + "epoch": 0.967899808249292, + "grad_norm": 2.4468142986297607, + "learning_rate": 6.657383462635105e-07, + "loss": 0.0177, + "step": 15080 + }, + { + "epoch": 0.9685416516234626, + "grad_norm": 0.02574695274233818, + "learning_rate": 6.524766262184205e-07, + "loss": 0.0127, + "step": 15090 + }, + { + "epoch": 0.9691834949976332, + "grad_norm": 0.8955485224723816, + "learning_rate": 6.392149061733308e-07, + "loss": 0.0222, + "step": 15100 + }, + { + "epoch": 0.9698253383718038, + "grad_norm": 0.622428297996521, + "learning_rate": 6.259531861282408e-07, + "loss": 0.0133, + "step": 15110 + }, + { + "epoch": 0.9704671817459745, + "grad_norm": 0.1865653395652771, + "learning_rate": 6.12691466083151e-07, + "loss": 0.0222, + "step": 15120 + }, + { + "epoch": 0.9711090251201451, + "grad_norm": 0.1449609100818634, + "learning_rate": 5.994297460380612e-07, + "loss": 0.0369, + "step": 15130 + }, + { + "epoch": 0.9717508684943157, + "grad_norm": 1.6518489122390747, + "learning_rate": 5.861680259929714e-07, + "loss": 0.0306, + "step": 15140 + }, + { + "epoch": 0.9723927118684863, + "grad_norm": 0.9899628758430481, + "learning_rate": 5.729063059478815e-07, + "loss": 0.0112, + "step": 15150 + }, + { + "epoch": 0.9730345552426569, + "grad_norm": 0.517170786857605, + "learning_rate": 5.596445859027916e-07, + "loss": 0.0186, + "step": 15160 + }, + { + "epoch": 0.9736763986168275, + "grad_norm": 0.008162170648574829, + "learning_rate": 5.463828658577018e-07, + "loss": 0.0054, + "step": 15170 + }, + { + "epoch": 0.9743182419909981, + "grad_norm": 0.7698886394500732, + "learning_rate": 5.331211458126119e-07, + "loss": 0.0282, + "step": 15180 + }, + { + "epoch": 0.9749600853651688, + "grad_norm": 4.941947937011719, + "learning_rate": 5.19859425767522e-07, + "loss": 0.0326, + "step": 15190 + }, + { + "epoch": 0.9756019287393394, + "grad_norm": 0.18007877469062805, + "learning_rate": 5.065977057224323e-07, + "loss": 0.0252, + "step": 15200 + }, + { + "epoch": 0.97624377211351, + "grad_norm": 0.21914184093475342, + "learning_rate": 4.933359856773424e-07, + "loss": 0.0333, + "step": 15210 + }, + { + "epoch": 0.9768856154876806, + "grad_norm": 0.45805874466896057, + "learning_rate": 4.800742656322525e-07, + "loss": 0.0149, + "step": 15220 + }, + { + "epoch": 0.9775274588618512, + "grad_norm": 6.29951810836792, + "learning_rate": 4.668125455871627e-07, + "loss": 0.0216, + "step": 15230 + }, + { + "epoch": 0.9781693022360218, + "grad_norm": 0.764335036277771, + "learning_rate": 4.535508255420729e-07, + "loss": 0.017, + "step": 15240 + }, + { + "epoch": 0.9788111456101924, + "grad_norm": 10.674691200256348, + "learning_rate": 4.40289105496983e-07, + "loss": 0.0288, + "step": 15250 + }, + { + "epoch": 0.9794529889843631, + "grad_norm": 0.6469502449035645, + "learning_rate": 4.2702738545189316e-07, + "loss": 0.0089, + "step": 15260 + }, + { + "epoch": 0.9800948323585337, + "grad_norm": 1.4636619091033936, + "learning_rate": 4.1376566540680327e-07, + "loss": 0.014, + "step": 15270 + }, + { + "epoch": 0.9807366757327043, + "grad_norm": 1.0619542598724365, + "learning_rate": 4.0050394536171344e-07, + "loss": 0.0175, + "step": 15280 + }, + { + "epoch": 0.9813785191068749, + "grad_norm": 0.06426899135112762, + "learning_rate": 3.872422253166236e-07, + "loss": 0.0124, + "step": 15290 + }, + { + "epoch": 0.9820203624810455, + "grad_norm": 8.639874458312988, + "learning_rate": 3.7398050527153377e-07, + "loss": 0.0284, + "step": 15300 + }, + { + "epoch": 0.9826622058552161, + "grad_norm": 0.9990230798721313, + "learning_rate": 3.607187852264439e-07, + "loss": 0.0224, + "step": 15310 + }, + { + "epoch": 0.9833040492293867, + "grad_norm": 3.5814836025238037, + "learning_rate": 3.4745706518135405e-07, + "loss": 0.0232, + "step": 15320 + }, + { + "epoch": 0.9839458926035575, + "grad_norm": 0.4112697243690491, + "learning_rate": 3.341953451362642e-07, + "loss": 0.0243, + "step": 15330 + }, + { + "epoch": 0.9845877359777281, + "grad_norm": 1.136162281036377, + "learning_rate": 3.209336250911744e-07, + "loss": 0.0056, + "step": 15340 + }, + { + "epoch": 0.9852295793518987, + "grad_norm": 0.06789114326238632, + "learning_rate": 3.076719050460845e-07, + "loss": 0.007, + "step": 15350 + }, + { + "epoch": 0.9858714227260693, + "grad_norm": 0.027609622105956078, + "learning_rate": 2.944101850009946e-07, + "loss": 0.0214, + "step": 15360 + }, + { + "epoch": 0.9865132661002399, + "grad_norm": 0.472529798746109, + "learning_rate": 2.811484649559048e-07, + "loss": 0.0295, + "step": 15370 + }, + { + "epoch": 0.9871551094744105, + "grad_norm": 1.0125441551208496, + "learning_rate": 2.6788674491081494e-07, + "loss": 0.0255, + "step": 15380 + }, + { + "epoch": 0.9877969528485812, + "grad_norm": 0.11004765331745148, + "learning_rate": 2.546250248657251e-07, + "loss": 0.021, + "step": 15390 + }, + { + "epoch": 0.9884387962227518, + "grad_norm": 2.7606465816497803, + "learning_rate": 2.413633048206352e-07, + "loss": 0.0242, + "step": 15400 + }, + { + "epoch": 0.9890806395969224, + "grad_norm": 0.45687106251716614, + "learning_rate": 2.281015847755454e-07, + "loss": 0.027, + "step": 15410 + }, + { + "epoch": 0.989722482971093, + "grad_norm": 1.2439097166061401, + "learning_rate": 2.1483986473045556e-07, + "loss": 0.0398, + "step": 15420 + }, + { + "epoch": 0.9903643263452636, + "grad_norm": 0.4050452709197998, + "learning_rate": 2.0157814468536572e-07, + "loss": 0.0135, + "step": 15430 + }, + { + "epoch": 0.9910061697194342, + "grad_norm": 4.158045768737793, + "learning_rate": 1.8831642464027584e-07, + "loss": 0.0178, + "step": 15440 + }, + { + "epoch": 0.9916480130936048, + "grad_norm": 0.40229663252830505, + "learning_rate": 1.75054704595186e-07, + "loss": 0.0175, + "step": 15450 + }, + { + "epoch": 0.9922898564677755, + "grad_norm": 0.27971166372299194, + "learning_rate": 1.6179298455009617e-07, + "loss": 0.0279, + "step": 15460 + }, + { + "epoch": 0.9929316998419461, + "grad_norm": 0.02092203125357628, + "learning_rate": 1.485312645050063e-07, + "loss": 0.016, + "step": 15470 + }, + { + "epoch": 0.9935735432161167, + "grad_norm": 0.5914480090141296, + "learning_rate": 1.3526954445991648e-07, + "loss": 0.0353, + "step": 15480 + }, + { + "epoch": 0.9942153865902873, + "grad_norm": 0.27921462059020996, + "learning_rate": 1.2200782441482662e-07, + "loss": 0.0139, + "step": 15490 + }, + { + "epoch": 0.9948572299644579, + "grad_norm": 0.5014231204986572, + "learning_rate": 1.0874610436973677e-07, + "loss": 0.0165, + "step": 15500 + }, + { + "epoch": 0.9954990733386285, + "grad_norm": 0.6527778506278992, + "learning_rate": 9.548438432464691e-08, + "loss": 0.0321, + "step": 15510 + }, + { + "epoch": 0.9961409167127991, + "grad_norm": 0.018357345834374428, + "learning_rate": 8.222266427955706e-08, + "loss": 0.0248, + "step": 15520 + }, + { + "epoch": 0.9967827600869698, + "grad_norm": 0.30090826749801636, + "learning_rate": 6.896094423446722e-08, + "loss": 0.0076, + "step": 15530 + }, + { + "epoch": 0.9974246034611404, + "grad_norm": 1.3387514352798462, + "learning_rate": 5.569922418937737e-08, + "loss": 0.0229, + "step": 15540 + }, + { + "epoch": 0.998066446835311, + "grad_norm": 2.0242581367492676, + "learning_rate": 4.2437504144287516e-08, + "loss": 0.0123, + "step": 15550 + }, + { + "epoch": 0.9987082902094816, + "grad_norm": 1.3312128782272339, + "learning_rate": 2.917578409919767e-08, + "loss": 0.0135, + "step": 15560 + }, + { + "epoch": 0.9993501335836522, + "grad_norm": 0.10261988639831543, + "learning_rate": 1.591406405410782e-08, + "loss": 0.0214, + "step": 15570 + }, + { + "epoch": 0.9999919769578228, + "grad_norm": 0.041646864265203476, + "learning_rate": 2.6523440090179698e-09, + "loss": 0.0197, + "step": 15580 + } + ], + "logging_steps": 10, + "max_steps": 15581, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}