{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9987138263665596, "eval_steps": 2000, "global_step": 1554, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012861736334405145, "grad_norm": 9.707831382751465, "learning_rate": 1.0000000000000002e-06, "loss": 25.452, "step": 1 }, { "epoch": 0.012861736334405145, "grad_norm": 11.00273323059082, "learning_rate": 1e-05, "loss": 26.4058, "step": 10 }, { "epoch": 0.02572347266881029, "grad_norm": 8.362554550170898, "learning_rate": 1.9e-05, "loss": 25.8934, "step": 20 }, { "epoch": 0.03858520900321544, "grad_norm": 7.939857482910156, "learning_rate": 2.9e-05, "loss": 24.5436, "step": 30 }, { "epoch": 0.05144694533762058, "grad_norm": 6.404581069946289, "learning_rate": 3.9000000000000006e-05, "loss": 23.0916, "step": 40 }, { "epoch": 0.06430868167202572, "grad_norm": 6.355580806732178, "learning_rate": 4.9e-05, "loss": 22.1792, "step": 50 }, { "epoch": 0.07717041800643087, "grad_norm": 7.588299751281738, "learning_rate": 5.9e-05, "loss": 21.8915, "step": 60 }, { "epoch": 0.09003215434083602, "grad_norm": 7.4134674072265625, "learning_rate": 6.9e-05, "loss": 21.4667, "step": 70 }, { "epoch": 0.10289389067524116, "grad_norm": 8.687100410461426, "learning_rate": 7.900000000000001e-05, "loss": 20.9957, "step": 80 }, { "epoch": 0.1157556270096463, "grad_norm": 7.489386081695557, "learning_rate": 8.900000000000001e-05, "loss": 20.3597, "step": 90 }, { "epoch": 0.12861736334405144, "grad_norm": 9.844521522521973, "learning_rate": 9.900000000000001e-05, "loss": 20.0669, "step": 100 }, { "epoch": 0.1414790996784566, "grad_norm": 9.825691223144531, "learning_rate": 9.938101788170564e-05, "loss": 20.2982, "step": 110 }, { "epoch": 0.15434083601286175, "grad_norm": 9.795324325561523, "learning_rate": 9.869325997248969e-05, "loss": 20.561, "step": 120 }, { "epoch": 0.16720257234726688, "grad_norm": 8.644824028015137, "learning_rate": 9.800550206327373e-05, "loss": 20.3877, "step": 130 }, { "epoch": 0.18006430868167203, "grad_norm": 8.671817779541016, "learning_rate": 9.731774415405778e-05, "loss": 20.1549, "step": 140 }, { "epoch": 0.19292604501607716, "grad_norm": 7.58090353012085, "learning_rate": 9.662998624484182e-05, "loss": 19.8608, "step": 150 }, { "epoch": 0.2057877813504823, "grad_norm": 7.75922155380249, "learning_rate": 9.594222833562586e-05, "loss": 19.7351, "step": 160 }, { "epoch": 0.21864951768488747, "grad_norm": 8.19886589050293, "learning_rate": 9.525447042640991e-05, "loss": 20.0158, "step": 170 }, { "epoch": 0.2315112540192926, "grad_norm": 7.9521403312683105, "learning_rate": 9.456671251719396e-05, "loss": 19.8263, "step": 180 }, { "epoch": 0.24437299035369775, "grad_norm": 7.865631580352783, "learning_rate": 9.3878954607978e-05, "loss": 19.5188, "step": 190 }, { "epoch": 0.2572347266881029, "grad_norm": 7.940866470336914, "learning_rate": 9.319119669876204e-05, "loss": 19.0615, "step": 200 }, { "epoch": 0.27009646302250806, "grad_norm": 8.591438293457031, "learning_rate": 9.250343878954608e-05, "loss": 19.8423, "step": 210 }, { "epoch": 0.2829581993569132, "grad_norm": 9.395312309265137, "learning_rate": 9.181568088033012e-05, "loss": 19.7645, "step": 220 }, { "epoch": 0.2958199356913183, "grad_norm": 9.619457244873047, "learning_rate": 9.112792297111418e-05, "loss": 19.5452, "step": 230 }, { "epoch": 0.3086816720257235, "grad_norm": 11.119356155395508, "learning_rate": 9.044016506189821e-05, "loss": 18.7142, "step": 240 }, { "epoch": 0.3215434083601286, "grad_norm": 8.78004264831543, "learning_rate": 8.975240715268226e-05, "loss": 19.8931, "step": 250 }, { "epoch": 0.33440514469453375, "grad_norm": 7.693414688110352, "learning_rate": 8.906464924346631e-05, "loss": 19.1463, "step": 260 }, { "epoch": 0.34726688102893893, "grad_norm": 8.954816818237305, "learning_rate": 8.837689133425034e-05, "loss": 19.4928, "step": 270 }, { "epoch": 0.36012861736334406, "grad_norm": 9.582860946655273, "learning_rate": 8.768913342503439e-05, "loss": 19.4513, "step": 280 }, { "epoch": 0.3729903536977492, "grad_norm": 10.099853515625, "learning_rate": 8.700137551581843e-05, "loss": 19.5683, "step": 290 }, { "epoch": 0.3858520900321543, "grad_norm": 8.16080093383789, "learning_rate": 8.631361760660248e-05, "loss": 19.2572, "step": 300 }, { "epoch": 0.3987138263665595, "grad_norm": 8.541340827941895, "learning_rate": 8.562585969738653e-05, "loss": 19.6704, "step": 310 }, { "epoch": 0.4115755627009646, "grad_norm": 8.608294486999512, "learning_rate": 8.493810178817056e-05, "loss": 19.5447, "step": 320 }, { "epoch": 0.42443729903536975, "grad_norm": 9.346860885620117, "learning_rate": 8.425034387895461e-05, "loss": 18.8019, "step": 330 }, { "epoch": 0.43729903536977494, "grad_norm": 8.101990699768066, "learning_rate": 8.356258596973866e-05, "loss": 19.2198, "step": 340 }, { "epoch": 0.45016077170418006, "grad_norm": 8.572087287902832, "learning_rate": 8.28748280605227e-05, "loss": 19.5076, "step": 350 }, { "epoch": 0.4630225080385852, "grad_norm": 8.421360969543457, "learning_rate": 8.218707015130674e-05, "loss": 19.4962, "step": 360 }, { "epoch": 0.4758842443729904, "grad_norm": 8.405660629272461, "learning_rate": 8.149931224209079e-05, "loss": 19.2253, "step": 370 }, { "epoch": 0.4887459807073955, "grad_norm": 9.72239875793457, "learning_rate": 8.081155433287483e-05, "loss": 19.2721, "step": 380 }, { "epoch": 0.5016077170418006, "grad_norm": 8.438865661621094, "learning_rate": 8.012379642365888e-05, "loss": 18.9673, "step": 390 }, { "epoch": 0.5144694533762058, "grad_norm": 7.178921222686768, "learning_rate": 7.943603851444293e-05, "loss": 19.0314, "step": 400 }, { "epoch": 0.5273311897106109, "grad_norm": 8.454991340637207, "learning_rate": 7.874828060522696e-05, "loss": 19.1222, "step": 410 }, { "epoch": 0.5401929260450161, "grad_norm": 9.644460678100586, "learning_rate": 7.806052269601101e-05, "loss": 19.5513, "step": 420 }, { "epoch": 0.5530546623794212, "grad_norm": 7.690568447113037, "learning_rate": 7.737276478679505e-05, "loss": 19.611, "step": 430 }, { "epoch": 0.5659163987138264, "grad_norm": 8.506192207336426, "learning_rate": 7.668500687757909e-05, "loss": 19.2757, "step": 440 }, { "epoch": 0.5787781350482315, "grad_norm": 8.91247272491455, "learning_rate": 7.599724896836315e-05, "loss": 19.2419, "step": 450 }, { "epoch": 0.5916398713826366, "grad_norm": 7.841221809387207, "learning_rate": 7.530949105914718e-05, "loss": 19.0954, "step": 460 }, { "epoch": 0.6045016077170418, "grad_norm": 8.165250778198242, "learning_rate": 7.462173314993123e-05, "loss": 19.2586, "step": 470 }, { "epoch": 0.617363344051447, "grad_norm": 8.386945724487305, "learning_rate": 7.393397524071526e-05, "loss": 19.1699, "step": 480 }, { "epoch": 0.6302250803858521, "grad_norm": 8.344170570373535, "learning_rate": 7.324621733149931e-05, "loss": 19.2644, "step": 490 }, { "epoch": 0.6430868167202572, "grad_norm": 8.671243667602539, "learning_rate": 7.255845942228337e-05, "loss": 18.9184, "step": 500 }, { "epoch": 0.6559485530546624, "grad_norm": 10.449199676513672, "learning_rate": 7.18707015130674e-05, "loss": 19.1155, "step": 510 }, { "epoch": 0.6688102893890675, "grad_norm": 9.303428649902344, "learning_rate": 7.118294360385145e-05, "loss": 19.4957, "step": 520 }, { "epoch": 0.6816720257234726, "grad_norm": 9.450081825256348, "learning_rate": 7.049518569463549e-05, "loss": 19.0008, "step": 530 }, { "epoch": 0.6945337620578779, "grad_norm": 8.767317771911621, "learning_rate": 6.980742778541953e-05, "loss": 18.7919, "step": 540 }, { "epoch": 0.707395498392283, "grad_norm": 8.295973777770996, "learning_rate": 6.911966987620358e-05, "loss": 18.9279, "step": 550 }, { "epoch": 0.7202572347266881, "grad_norm": 8.042985916137695, "learning_rate": 6.843191196698763e-05, "loss": 18.7261, "step": 560 }, { "epoch": 0.7331189710610932, "grad_norm": 8.96065902709961, "learning_rate": 6.774415405777167e-05, "loss": 18.8108, "step": 570 }, { "epoch": 0.7459807073954984, "grad_norm": 8.509966850280762, "learning_rate": 6.705639614855571e-05, "loss": 18.8661, "step": 580 }, { "epoch": 0.7588424437299035, "grad_norm": 8.156121253967285, "learning_rate": 6.636863823933975e-05, "loss": 18.9892, "step": 590 }, { "epoch": 0.7717041800643086, "grad_norm": 8.182650566101074, "learning_rate": 6.56808803301238e-05, "loss": 19.0869, "step": 600 }, { "epoch": 0.7845659163987139, "grad_norm": 9.14923095703125, "learning_rate": 6.499312242090785e-05, "loss": 19.0666, "step": 610 }, { "epoch": 0.797427652733119, "grad_norm": 9.078425407409668, "learning_rate": 6.43053645116919e-05, "loss": 19.0009, "step": 620 }, { "epoch": 0.8102893890675241, "grad_norm": 10.448148727416992, "learning_rate": 6.361760660247593e-05, "loss": 19.2314, "step": 630 }, { "epoch": 0.8231511254019293, "grad_norm": 8.34157943725586, "learning_rate": 6.292984869325998e-05, "loss": 19.0115, "step": 640 }, { "epoch": 0.8360128617363344, "grad_norm": 8.367515563964844, "learning_rate": 6.224209078404401e-05, "loss": 18.8488, "step": 650 }, { "epoch": 0.8488745980707395, "grad_norm": 9.3355073928833, "learning_rate": 6.155433287482807e-05, "loss": 18.5198, "step": 660 }, { "epoch": 0.8617363344051447, "grad_norm": 9.879706382751465, "learning_rate": 6.086657496561211e-05, "loss": 19.0761, "step": 670 }, { "epoch": 0.8745980707395499, "grad_norm": 8.172968864440918, "learning_rate": 6.017881705639615e-05, "loss": 19.1141, "step": 680 }, { "epoch": 0.887459807073955, "grad_norm": 8.10605525970459, "learning_rate": 5.949105914718019e-05, "loss": 18.9529, "step": 690 }, { "epoch": 0.9003215434083601, "grad_norm": 9.074273109436035, "learning_rate": 5.880330123796424e-05, "loss": 18.7229, "step": 700 }, { "epoch": 0.9131832797427653, "grad_norm": 8.10617733001709, "learning_rate": 5.811554332874828e-05, "loss": 18.9373, "step": 710 }, { "epoch": 0.9260450160771704, "grad_norm": 9.200577735900879, "learning_rate": 5.7427785419532334e-05, "loss": 18.9493, "step": 720 }, { "epoch": 0.9389067524115756, "grad_norm": 8.880610466003418, "learning_rate": 5.6740027510316374e-05, "loss": 18.772, "step": 730 }, { "epoch": 0.9517684887459807, "grad_norm": 8.134041786193848, "learning_rate": 5.6052269601100415e-05, "loss": 18.9767, "step": 740 }, { "epoch": 0.9646302250803859, "grad_norm": 8.776124000549316, "learning_rate": 5.543328748280605e-05, "loss": 18.3181, "step": 750 }, { "epoch": 0.977491961414791, "grad_norm": 9.204455375671387, "learning_rate": 5.4745529573590104e-05, "loss": 19.0893, "step": 760 }, { "epoch": 0.9903536977491961, "grad_norm": 8.519386291503906, "learning_rate": 5.4057771664374145e-05, "loss": 19.1596, "step": 770 }, { "epoch": 1.0032154340836013, "grad_norm": 8.442768096923828, "learning_rate": 5.3370013755158185e-05, "loss": 18.3393, "step": 780 }, { "epoch": 1.0160771704180065, "grad_norm": 8.598028182983398, "learning_rate": 5.268225584594223e-05, "loss": 17.5667, "step": 790 }, { "epoch": 1.0289389067524115, "grad_norm": 11.272820472717285, "learning_rate": 5.199449793672627e-05, "loss": 17.5853, "step": 800 }, { "epoch": 1.0418006430868167, "grad_norm": 10.125863075256348, "learning_rate": 5.130674002751031e-05, "loss": 18.1351, "step": 810 }, { "epoch": 1.0546623794212218, "grad_norm": 10.486109733581543, "learning_rate": 5.061898211829437e-05, "loss": 17.4027, "step": 820 }, { "epoch": 1.067524115755627, "grad_norm": 10.920713424682617, "learning_rate": 4.993122420907841e-05, "loss": 17.9291, "step": 830 }, { "epoch": 1.0803858520900322, "grad_norm": 9.404900550842285, "learning_rate": 4.924346629986245e-05, "loss": 17.7988, "step": 840 }, { "epoch": 1.0932475884244373, "grad_norm": 9.127197265625, "learning_rate": 4.8555708390646495e-05, "loss": 18.108, "step": 850 }, { "epoch": 1.1061093247588425, "grad_norm": 10.35006332397461, "learning_rate": 4.786795048143054e-05, "loss": 18.016, "step": 860 }, { "epoch": 1.1189710610932475, "grad_norm": 9.213512420654297, "learning_rate": 4.718019257221458e-05, "loss": 18.1557, "step": 870 }, { "epoch": 1.1318327974276527, "grad_norm": 11.302148818969727, "learning_rate": 4.649243466299862e-05, "loss": 17.766, "step": 880 }, { "epoch": 1.144694533762058, "grad_norm": 9.329822540283203, "learning_rate": 4.580467675378267e-05, "loss": 18.2947, "step": 890 }, { "epoch": 1.157556270096463, "grad_norm": 9.3578519821167, "learning_rate": 4.511691884456672e-05, "loss": 17.9609, "step": 900 }, { "epoch": 1.1704180064308682, "grad_norm": 9.759269714355469, "learning_rate": 4.4429160935350764e-05, "loss": 18.1176, "step": 910 }, { "epoch": 1.1832797427652733, "grad_norm": 9.538485527038574, "learning_rate": 4.3741403026134804e-05, "loss": 18.5584, "step": 920 }, { "epoch": 1.1961414790996785, "grad_norm": 11.227431297302246, "learning_rate": 4.3053645116918845e-05, "loss": 18.0436, "step": 930 }, { "epoch": 1.2090032154340835, "grad_norm": 10.502628326416016, "learning_rate": 4.236588720770289e-05, "loss": 18.0714, "step": 940 }, { "epoch": 1.2218649517684887, "grad_norm": 10.603680610656738, "learning_rate": 4.167812929848693e-05, "loss": 17.8879, "step": 950 }, { "epoch": 1.234726688102894, "grad_norm": 10.099736213684082, "learning_rate": 4.099037138927098e-05, "loss": 17.8487, "step": 960 }, { "epoch": 1.247588424437299, "grad_norm": 12.383750915527344, "learning_rate": 4.0302613480055027e-05, "loss": 18.4906, "step": 970 }, { "epoch": 1.2604501607717042, "grad_norm": 9.282815933227539, "learning_rate": 3.961485557083907e-05, "loss": 18.1795, "step": 980 }, { "epoch": 1.2733118971061093, "grad_norm": 10.828181266784668, "learning_rate": 3.8927097661623114e-05, "loss": 17.6422, "step": 990 }, { "epoch": 1.2861736334405145, "grad_norm": 10.479561805725098, "learning_rate": 3.8239339752407154e-05, "loss": 18.145, "step": 1000 }, { "epoch": 1.2990353697749195, "grad_norm": 10.827543258666992, "learning_rate": 3.7551581843191195e-05, "loss": 18.669, "step": 1010 }, { "epoch": 1.3118971061093248, "grad_norm": 9.661266326904297, "learning_rate": 3.686382393397524e-05, "loss": 17.7707, "step": 1020 }, { "epoch": 1.32475884244373, "grad_norm": 11.319337844848633, "learning_rate": 3.617606602475929e-05, "loss": 17.9766, "step": 1030 }, { "epoch": 1.337620578778135, "grad_norm": 10.7207670211792, "learning_rate": 3.548830811554333e-05, "loss": 17.7066, "step": 1040 }, { "epoch": 1.3504823151125402, "grad_norm": 10.781481742858887, "learning_rate": 3.480055020632738e-05, "loss": 18.2417, "step": 1050 }, { "epoch": 1.3633440514469453, "grad_norm": 9.606935501098633, "learning_rate": 3.411279229711142e-05, "loss": 17.5177, "step": 1060 }, { "epoch": 1.3762057877813505, "grad_norm": 10.399476051330566, "learning_rate": 3.342503438789546e-05, "loss": 18.0005, "step": 1070 }, { "epoch": 1.3890675241157555, "grad_norm": 14.43407917022705, "learning_rate": 3.2737276478679505e-05, "loss": 17.8236, "step": 1080 }, { "epoch": 1.4019292604501608, "grad_norm": 10.456531524658203, "learning_rate": 3.204951856946355e-05, "loss": 17.5033, "step": 1090 }, { "epoch": 1.414790996784566, "grad_norm": 10.075111389160156, "learning_rate": 3.13617606602476e-05, "loss": 18.1669, "step": 1100 }, { "epoch": 1.427652733118971, "grad_norm": 11.425241470336914, "learning_rate": 3.067400275103164e-05, "loss": 18.1447, "step": 1110 }, { "epoch": 1.4405144694533762, "grad_norm": 10.018535614013672, "learning_rate": 2.998624484181568e-05, "loss": 18.0492, "step": 1120 }, { "epoch": 1.4533762057877815, "grad_norm": 10.587430953979492, "learning_rate": 2.9298486932599727e-05, "loss": 17.812, "step": 1130 }, { "epoch": 1.4662379421221865, "grad_norm": 10.457320213317871, "learning_rate": 2.861072902338377e-05, "loss": 17.4634, "step": 1140 }, { "epoch": 1.4790996784565915, "grad_norm": 11.961458206176758, "learning_rate": 2.792297111416781e-05, "loss": 17.8976, "step": 1150 }, { "epoch": 1.4919614147909968, "grad_norm": 9.497962951660156, "learning_rate": 2.7235213204951858e-05, "loss": 18.2895, "step": 1160 }, { "epoch": 1.504823151125402, "grad_norm": 11.496424674987793, "learning_rate": 2.6547455295735902e-05, "loss": 17.9459, "step": 1170 }, { "epoch": 1.517684887459807, "grad_norm": 11.947159767150879, "learning_rate": 2.585969738651995e-05, "loss": 17.6389, "step": 1180 }, { "epoch": 1.5305466237942122, "grad_norm": 9.540078163146973, "learning_rate": 2.517193947730399e-05, "loss": 17.5585, "step": 1190 }, { "epoch": 1.5434083601286175, "grad_norm": 11.378511428833008, "learning_rate": 2.4484181568088037e-05, "loss": 17.9126, "step": 1200 }, { "epoch": 1.5562700964630225, "grad_norm": 10.624009132385254, "learning_rate": 2.3796423658872077e-05, "loss": 17.5224, "step": 1210 }, { "epoch": 1.5691318327974275, "grad_norm": 10.94864273071289, "learning_rate": 2.310866574965612e-05, "loss": 17.6583, "step": 1220 }, { "epoch": 1.5819935691318328, "grad_norm": 11.377878189086914, "learning_rate": 2.2420907840440168e-05, "loss": 18.2857, "step": 1230 }, { "epoch": 1.594855305466238, "grad_norm": 10.72612476348877, "learning_rate": 2.173314993122421e-05, "loss": 17.7727, "step": 1240 }, { "epoch": 1.607717041800643, "grad_norm": 10.036147117614746, "learning_rate": 2.1045392022008252e-05, "loss": 18.1126, "step": 1250 }, { "epoch": 1.6205787781350482, "grad_norm": 11.862444877624512, "learning_rate": 2.03576341127923e-05, "loss": 18.0734, "step": 1260 }, { "epoch": 1.6334405144694535, "grad_norm": 10.716621398925781, "learning_rate": 1.9669876203576343e-05, "loss": 18.0464, "step": 1270 }, { "epoch": 1.6463022508038585, "grad_norm": 11.006821632385254, "learning_rate": 1.8982118294360383e-05, "loss": 17.7383, "step": 1280 }, { "epoch": 1.6591639871382635, "grad_norm": 10.550652503967285, "learning_rate": 1.829436038514443e-05, "loss": 17.961, "step": 1290 }, { "epoch": 1.6720257234726688, "grad_norm": 12.875121116638184, "learning_rate": 1.7606602475928474e-05, "loss": 18.1224, "step": 1300 }, { "epoch": 1.684887459807074, "grad_norm": 10.083199501037598, "learning_rate": 1.6918844566712518e-05, "loss": 18.0676, "step": 1310 }, { "epoch": 1.697749196141479, "grad_norm": 10.942306518554688, "learning_rate": 1.6231086657496562e-05, "loss": 17.6347, "step": 1320 }, { "epoch": 1.7106109324758842, "grad_norm": 10.71388053894043, "learning_rate": 1.5543328748280606e-05, "loss": 17.8142, "step": 1330 }, { "epoch": 1.7234726688102895, "grad_norm": 10.833003997802734, "learning_rate": 1.4855570839064651e-05, "loss": 17.2463, "step": 1340 }, { "epoch": 1.7363344051446945, "grad_norm": 9.758099555969238, "learning_rate": 1.4167812929848695e-05, "loss": 17.603, "step": 1350 }, { "epoch": 1.7491961414790995, "grad_norm": 11.835128784179688, "learning_rate": 1.3480055020632737e-05, "loss": 17.6819, "step": 1360 }, { "epoch": 1.762057877813505, "grad_norm": 12.568058967590332, "learning_rate": 1.2792297111416782e-05, "loss": 18.0653, "step": 1370 }, { "epoch": 1.77491961414791, "grad_norm": 11.193774223327637, "learning_rate": 1.2104539202200826e-05, "loss": 17.9026, "step": 1380 }, { "epoch": 1.787781350482315, "grad_norm": 11.404935836791992, "learning_rate": 1.141678129298487e-05, "loss": 17.6447, "step": 1390 }, { "epoch": 1.8006430868167203, "grad_norm": 11.925581932067871, "learning_rate": 1.0729023383768915e-05, "loss": 17.991, "step": 1400 }, { "epoch": 1.8135048231511255, "grad_norm": 10.404480934143066, "learning_rate": 1.0041265474552957e-05, "loss": 17.6332, "step": 1410 }, { "epoch": 1.8263665594855305, "grad_norm": 11.522180557250977, "learning_rate": 9.353507565337003e-06, "loss": 17.4805, "step": 1420 }, { "epoch": 1.8392282958199357, "grad_norm": 10.101544380187988, "learning_rate": 8.665749656121047e-06, "loss": 17.74, "step": 1430 }, { "epoch": 1.852090032154341, "grad_norm": 10.94338321685791, "learning_rate": 7.977991746905089e-06, "loss": 17.8962, "step": 1440 }, { "epoch": 1.864951768488746, "grad_norm": 10.872537612915039, "learning_rate": 7.290233837689134e-06, "loss": 17.3997, "step": 1450 }, { "epoch": 1.877813504823151, "grad_norm": 13.044309616088867, "learning_rate": 6.602475928473177e-06, "loss": 17.6202, "step": 1460 }, { "epoch": 1.8906752411575563, "grad_norm": 11.46693229675293, "learning_rate": 5.914718019257222e-06, "loss": 17.2325, "step": 1470 }, { "epoch": 1.9035369774919615, "grad_norm": 11.1300048828125, "learning_rate": 5.226960110041266e-06, "loss": 17.7343, "step": 1480 }, { "epoch": 1.9163987138263665, "grad_norm": 11.025785446166992, "learning_rate": 4.53920220082531e-06, "loss": 17.7074, "step": 1490 }, { "epoch": 1.9292604501607717, "grad_norm": 12.155854225158691, "learning_rate": 3.851444291609354e-06, "loss": 17.6288, "step": 1500 }, { "epoch": 1.942122186495177, "grad_norm": 9.896262168884277, "learning_rate": 3.163686382393398e-06, "loss": 17.9447, "step": 1510 }, { "epoch": 1.954983922829582, "grad_norm": 9.653135299682617, "learning_rate": 2.4759284731774417e-06, "loss": 18.0259, "step": 1520 }, { "epoch": 1.967845659163987, "grad_norm": 11.20612907409668, "learning_rate": 1.7881705639614857e-06, "loss": 18.1078, "step": 1530 }, { "epoch": 1.9807073954983923, "grad_norm": 12.505290985107422, "learning_rate": 1.1004126547455296e-06, "loss": 17.5399, "step": 1540 }, { "epoch": 1.9935691318327975, "grad_norm": 11.760818481445312, "learning_rate": 4.1265474552957356e-07, "loss": 18.1392, "step": 1550 } ], "logging_steps": 10, "max_steps": 1554, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.700603965563208e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }