{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9977046671767407,
  "eval_steps": 500,
  "global_step": 1959,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.015302218821729151,
      "grad_norm": 2.9595158525528062,
      "learning_rate": 1.0204081632653061e-05,
      "loss": 1.7025,
      "step": 10
    },
    {
      "epoch": 0.030604437643458302,
      "grad_norm": 1.263751808272248,
      "learning_rate": 2.0408163265306123e-05,
      "loss": 1.4094,
      "step": 20
    },
    {
      "epoch": 0.045906656465187455,
      "grad_norm": 1.1314499693827325,
      "learning_rate": 3.061224489795919e-05,
      "loss": 1.0742,
      "step": 30
    },
    {
      "epoch": 0.061208875286916604,
      "grad_norm": 0.43429385320294256,
      "learning_rate": 4.0816326530612245e-05,
      "loss": 0.8975,
      "step": 40
    },
    {
      "epoch": 0.07651109410864575,
      "grad_norm": 0.37910893323587813,
      "learning_rate": 5.102040816326531e-05,
      "loss": 0.7694,
      "step": 50
    },
    {
      "epoch": 0.09181331293037491,
      "grad_norm": 0.32985782659377816,
      "learning_rate": 6.122448979591838e-05,
      "loss": 0.6667,
      "step": 60
    },
    {
      "epoch": 0.10711553175210406,
      "grad_norm": 0.27993689719153514,
      "learning_rate": 7.142857142857143e-05,
      "loss": 0.6143,
      "step": 70
    },
    {
      "epoch": 0.12241775057383321,
      "grad_norm": 0.2562350918748776,
      "learning_rate": 8.163265306122449e-05,
      "loss": 0.5736,
      "step": 80
    },
    {
      "epoch": 0.13771996939556236,
      "grad_norm": 0.2627549443187762,
      "learning_rate": 9.183673469387756e-05,
      "loss": 0.5605,
      "step": 90
    },
    {
      "epoch": 0.1530221882172915,
      "grad_norm": 0.25450163445897056,
      "learning_rate": 0.00010204081632653062,
      "loss": 0.5425,
      "step": 100
    },
    {
      "epoch": 0.16832440703902066,
      "grad_norm": 0.265185130228763,
      "learning_rate": 0.00011224489795918367,
      "loss": 0.5417,
      "step": 110
    },
    {
      "epoch": 0.18362662586074982,
      "grad_norm": 0.2600738826510011,
      "learning_rate": 0.00012244897959183676,
      "loss": 0.5349,
      "step": 120
    },
    {
      "epoch": 0.19892884468247896,
      "grad_norm": 0.27451283782332153,
      "learning_rate": 0.0001326530612244898,
      "loss": 0.5225,
      "step": 130
    },
    {
      "epoch": 0.21423106350420812,
      "grad_norm": 0.29276216948080613,
      "learning_rate": 0.00014285714285714287,
      "loss": 0.5148,
      "step": 140
    },
    {
      "epoch": 0.22953328232593725,
      "grad_norm": 0.25373117999248507,
      "learning_rate": 0.0001530612244897959,
      "loss": 0.5108,
      "step": 150
    },
    {
      "epoch": 0.24483550114766642,
      "grad_norm": 0.26479254050664824,
      "learning_rate": 0.00016326530612244898,
      "loss": 0.5061,
      "step": 160
    },
    {
      "epoch": 0.26013771996939555,
      "grad_norm": 0.27232811822105624,
      "learning_rate": 0.00017346938775510205,
      "loss": 0.5065,
      "step": 170
    },
    {
      "epoch": 0.2754399387911247,
      "grad_norm": 0.26418337252250673,
      "learning_rate": 0.00018367346938775512,
      "loss": 0.4967,
      "step": 180
    },
    {
      "epoch": 0.2907421576128539,
      "grad_norm": 0.2503450300406005,
      "learning_rate": 0.00019387755102040816,
      "loss": 0.4998,
      "step": 190
    },
    {
      "epoch": 0.306044376434583,
      "grad_norm": 0.2339540170507543,
      "learning_rate": 0.0001999974597101728,
      "loss": 0.4989,
      "step": 200
    },
    {
      "epoch": 0.32134659525631215,
      "grad_norm": 0.23446730330407825,
      "learning_rate": 0.0001999688829317862,
      "loss": 0.4895,
      "step": 210
    },
    {
      "epoch": 0.3366488140780413,
      "grad_norm": 0.23685036500213205,
      "learning_rate": 0.00019990856311693857,
      "loss": 0.4898,
      "step": 220
    },
    {
      "epoch": 0.3519510328997705,
      "grad_norm": 0.2396529148643123,
      "learning_rate": 0.00019981651941893068,
      "loss": 0.4881,
      "step": 230
    },
    {
      "epoch": 0.36725325172149964,
      "grad_norm": 0.22794906865901965,
      "learning_rate": 0.0001996927810643216,
      "loss": 0.4825,
      "step": 240
    },
    {
      "epoch": 0.38255547054322875,
      "grad_norm": 0.21749849325438408,
      "learning_rate": 0.00019953738734364843,
      "loss": 0.4905,
      "step": 250
    },
    {
      "epoch": 0.3978576893649579,
      "grad_norm": 0.21941306857106413,
      "learning_rate": 0.00019935038759895038,
      "loss": 0.4844,
      "step": 260
    },
    {
      "epoch": 0.4131599081866871,
      "grad_norm": 0.2106883472247568,
      "learning_rate": 0.0001991318412081012,
      "loss": 0.4832,
      "step": 270
    },
    {
      "epoch": 0.42846212700841624,
      "grad_norm": 0.20643151525949935,
      "learning_rate": 0.00019888181756595513,
      "loss": 0.4732,
      "step": 280
    },
    {
      "epoch": 0.44376434583014535,
      "grad_norm": 0.21375448358620805,
      "learning_rate": 0.0001986003960623118,
      "loss": 0.4777,
      "step": 290
    },
    {
      "epoch": 0.4590665646518745,
      "grad_norm": 0.2078386748706879,
      "learning_rate": 0.0001982876660567078,
      "loss": 0.4773,
      "step": 300
    },
    {
      "epoch": 0.4743687834736037,
      "grad_norm": 0.20709767688640485,
      "learning_rate": 0.00019794372685004232,
      "loss": 0.4774,
      "step": 310
    },
    {
      "epoch": 0.48967100229533284,
      "grad_norm": 0.20114204358521104,
      "learning_rate": 0.00019756868765304637,
      "loss": 0.4724,
      "step": 320
    },
    {
      "epoch": 0.504973221117062,
      "grad_norm": 0.2047995080216857,
      "learning_rate": 0.000197162667551605,
      "loss": 0.4723,
      "step": 330
    },
    {
      "epoch": 0.5202754399387911,
      "grad_norm": 0.2065404899680565,
      "learning_rate": 0.00019672579546894418,
      "loss": 0.4744,
      "step": 340
    },
    {
      "epoch": 0.5355776587605203,
      "grad_norm": 0.20339328597837086,
      "learning_rate": 0.00019625821012469392,
      "loss": 0.469,
      "step": 350
    },
    {
      "epoch": 0.5508798775822494,
      "grad_norm": 0.20269051264889382,
      "learning_rate": 0.0001957600599908406,
      "loss": 0.4652,
      "step": 360
    },
    {
      "epoch": 0.5661820964039785,
      "grad_norm": 0.20140204074578216,
      "learning_rate": 0.00019523150324458297,
      "loss": 0.4663,
      "step": 370
    },
    {
      "epoch": 0.5814843152257078,
      "grad_norm": 0.20145479660046747,
      "learning_rate": 0.0001946727077181062,
      "loss": 0.4647,
      "step": 380
    },
    {
      "epoch": 0.5967865340474369,
      "grad_norm": 0.19812410832421173,
      "learning_rate": 0.00019408385084529014,
      "loss": 0.4669,
      "step": 390
    },
    {
      "epoch": 0.612088752869166,
      "grad_norm": 0.20593747645685082,
      "learning_rate": 0.0001934651196053692,
      "loss": 0.4606,
      "step": 400
    },
    {
      "epoch": 0.6273909716908952,
      "grad_norm": 0.19985586154896637,
      "learning_rate": 0.00019281671046356057,
      "loss": 0.465,
      "step": 410
    },
    {
      "epoch": 0.6426931905126243,
      "grad_norm": 0.18804149710765375,
      "learning_rate": 0.0001921388293086812,
      "loss": 0.4591,
      "step": 420
    },
    {
      "epoch": 0.6579954093343535,
      "grad_norm": 0.20403982281128524,
      "learning_rate": 0.00019143169138777176,
      "loss": 0.4612,
      "step": 430
    },
    {
      "epoch": 0.6732976281560826,
      "grad_norm": 0.19472172680549107,
      "learning_rate": 0.00019069552123774966,
      "loss": 0.4535,
      "step": 440
    },
    {
      "epoch": 0.6885998469778117,
      "grad_norm": 0.18853093267297413,
      "learning_rate": 0.00018993055261411188,
      "loss": 0.4536,
      "step": 450
    },
    {
      "epoch": 0.703902065799541,
      "grad_norm": 0.1824795154817547,
      "learning_rate": 0.0001891370284167108,
      "loss": 0.4533,
      "step": 460
    },
    {
      "epoch": 0.7192042846212701,
      "grad_norm": 0.19623573820167492,
      "learning_rate": 0.00018831520061262657,
      "loss": 0.4608,
      "step": 470
    },
    {
      "epoch": 0.7345065034429993,
      "grad_norm": 0.1894732171051352,
      "learning_rate": 0.00018746533015615997,
      "loss": 0.4561,
      "step": 480
    },
    {
      "epoch": 0.7498087222647284,
      "grad_norm": 0.1956746346911804,
      "learning_rate": 0.00018658768690597198,
      "loss": 0.4562,
      "step": 490
    },
    {
      "epoch": 0.7651109410864575,
      "grad_norm": 0.19485019146231133,
      "learning_rate": 0.00018568254953939573,
      "loss": 0.4547,
      "step": 500
    },
    {
      "epoch": 0.7804131599081867,
      "grad_norm": 0.1900932930656561,
      "learning_rate": 0.0001847502054639483,
      "loss": 0.4563,
      "step": 510
    },
    {
      "epoch": 0.7957153787299158,
      "grad_norm": 0.18608837489031835,
      "learning_rate": 0.00018379095072607052,
      "loss": 0.4546,
      "step": 520
    },
    {
      "epoch": 0.811017597551645,
      "grad_norm": 0.19229921508866923,
      "learning_rate": 0.0001828050899171234,
      "loss": 0.4506,
      "step": 530
    },
    {
      "epoch": 0.8263198163733741,
      "grad_norm": 0.20164999528455674,
      "learning_rate": 0.00018179293607667178,
      "loss": 0.4528,
      "step": 540
    },
    {
      "epoch": 0.8416220351951033,
      "grad_norm": 0.18962588415494633,
      "learning_rate": 0.00018075481059308488,
      "loss": 0.4501,
      "step": 550
    },
    {
      "epoch": 0.8569242540168325,
      "grad_norm": 0.1878318467581915,
      "learning_rate": 0.00017969104310148627,
      "loss": 0.4489,
      "step": 560
    },
    {
      "epoch": 0.8722264728385616,
      "grad_norm": 0.19855697462159105,
      "learning_rate": 0.00017860197137908504,
      "loss": 0.4486,
      "step": 570
    },
    {
      "epoch": 0.8875286916602907,
      "grad_norm": 0.18806947832467016,
      "learning_rate": 0.00017748794123792169,
      "loss": 0.449,
      "step": 580
    },
    {
      "epoch": 0.9028309104820199,
      "grad_norm": 0.18547154568221744,
      "learning_rate": 0.00017634930641506272,
      "loss": 0.4468,
      "step": 590
    },
    {
      "epoch": 0.918133129303749,
      "grad_norm": 0.1988415349574148,
      "learning_rate": 0.00017518642846027876,
      "loss": 0.4447,
      "step": 600
    },
    {
      "epoch": 0.9334353481254782,
      "grad_norm": 0.1971157063292002,
      "learning_rate": 0.00017399967662124204,
      "loss": 0.4496,
      "step": 610
    },
    {
      "epoch": 0.9487375669472073,
      "grad_norm": 0.18821213785618154,
      "learning_rate": 0.00017278942772627954,
      "loss": 0.4442,
      "step": 620
    },
    {
      "epoch": 0.9640397857689365,
      "grad_norm": 0.1863099123502169,
      "learning_rate": 0.00017155606606471873,
      "loss": 0.4428,
      "step": 630
    },
    {
      "epoch": 0.9793420045906657,
      "grad_norm": 0.1877422977434543,
      "learning_rate": 0.00017029998326486485,
      "loss": 0.4431,
      "step": 640
    },
    {
      "epoch": 0.9946442234123948,
      "grad_norm": 0.1976160670120698,
      "learning_rate": 0.00016902157816964724,
      "loss": 0.4382,
      "step": 650
    },
    {
      "epoch": 1.009946442234124,
      "grad_norm": 0.18992306538475748,
      "learning_rate": 0.0001677212567099752,
      "loss": 0.4353,
      "step": 660
    },
    {
      "epoch": 1.025248661055853,
      "grad_norm": 0.19209926834930846,
      "learning_rate": 0.00016639943177584302,
      "loss": 0.422,
      "step": 670
    },
    {
      "epoch": 1.0405508798775822,
      "grad_norm": 0.1862611696289539,
      "learning_rate": 0.00016505652308522546,
      "loss": 0.4166,
      "step": 680
    },
    {
      "epoch": 1.0558530986993113,
      "grad_norm": 0.19413670929825885,
      "learning_rate": 0.00016369295705080493,
      "loss": 0.4214,
      "step": 690
    },
    {
      "epoch": 1.0711553175210407,
      "grad_norm": 0.19044466403960753,
      "learning_rate": 0.00016230916664457303,
      "loss": 0.4226,
      "step": 700
    },
    {
      "epoch": 1.0864575363427698,
      "grad_norm": 0.20537259284486634,
      "learning_rate": 0.00016090559126034955,
      "loss": 0.4206,
      "step": 710
    },
    {
      "epoch": 1.1017597551644989,
      "grad_norm": 0.19548453583302455,
      "learning_rate": 0.00015948267657426172,
      "loss": 0.4258,
      "step": 720
    },
    {
      "epoch": 1.117061973986228,
      "grad_norm": 0.19233863084054012,
      "learning_rate": 0.00015804087440322937,
      "loss": 0.4212,
      "step": 730
    },
    {
      "epoch": 1.132364192807957,
      "grad_norm": 0.20059506523438514,
      "learning_rate": 0.00015658064256149972,
      "loss": 0.4248,
      "step": 740
    },
    {
      "epoch": 1.1476664116296864,
      "grad_norm": 0.19781190556296033,
      "learning_rate": 0.00015510244471527798,
      "loss": 0.419,
      "step": 750
    },
    {
      "epoch": 1.1629686304514155,
      "grad_norm": 0.1978522622616877,
      "learning_rate": 0.0001536067502355,
      "loss": 0.4156,
      "step": 760
    },
    {
      "epoch": 1.1782708492731446,
      "grad_norm": 0.19356090367390197,
      "learning_rate": 0.00015209403404879303,
      "loss": 0.4176,
      "step": 770
    },
    {
      "epoch": 1.1935730680948737,
      "grad_norm": 0.21041098882140832,
      "learning_rate": 0.0001505647764866729,
      "loss": 0.4155,
      "step": 780
    },
    {
      "epoch": 1.2088752869166028,
      "grad_norm": 0.19309297495512096,
      "learning_rate": 0.00014901946313302452,
      "loss": 0.4125,
      "step": 790
    },
    {
      "epoch": 1.2241775057383322,
      "grad_norm": 0.20988205146033515,
      "learning_rate": 0.0001474585846699151,
      "loss": 0.4198,
      "step": 800
    },
    {
      "epoch": 1.2394797245600613,
      "grad_norm": 0.19899429014402967,
      "learning_rate": 0.00014588263672178812,
      "loss": 0.4139,
      "step": 810
    },
    {
      "epoch": 1.2547819433817904,
      "grad_norm": 0.19678065811835893,
      "learning_rate": 0.00014429211969808808,
      "loss": 0.4168,
      "step": 820
    },
    {
      "epoch": 1.2700841622035195,
      "grad_norm": 0.20776458272762527,
      "learning_rate": 0.00014268753863436602,
      "loss": 0.4137,
      "step": 830
    },
    {
      "epoch": 1.2853863810252486,
      "grad_norm": 0.2056945302468653,
      "learning_rate": 0.00014106940303191583,
      "loss": 0.4166,
      "step": 840
    },
    {
      "epoch": 1.300688599846978,
      "grad_norm": 0.20731916359275016,
      "learning_rate": 0.000139438226695993,
      "loss": 0.4179,
      "step": 850
    },
    {
      "epoch": 1.315990818668707,
      "grad_norm": 0.2001144508118885,
      "learning_rate": 0.00013779452757266617,
      "loss": 0.4131,
      "step": 860
    },
    {
      "epoch": 1.3312930374904361,
      "grad_norm": 0.19624161193905473,
      "learning_rate": 0.00013613882758435435,
      "loss": 0.4089,
      "step": 870
    },
    {
      "epoch": 1.3465952563121653,
      "grad_norm": 0.20129114730483852,
      "learning_rate": 0.0001344716524641012,
      "loss": 0.4149,
      "step": 880
    },
    {
      "epoch": 1.3618974751338944,
      "grad_norm": 0.207327826679874,
      "learning_rate": 0.0001327935315886395,
      "loss": 0.4097,
      "step": 890
    },
    {
      "epoch": 1.3771996939556237,
      "grad_norm": 0.2029298678382067,
      "learning_rate": 0.00013110499781029874,
      "loss": 0.4132,
      "step": 900
    },
    {
      "epoch": 1.3925019127773526,
      "grad_norm": 0.20764777212748992,
      "learning_rate": 0.00012940658728780862,
      "loss": 0.4142,
      "step": 910
    },
    {
      "epoch": 1.407804131599082,
      "grad_norm": 0.2017977764168179,
      "learning_rate": 0.00012769883931605333,
      "loss": 0.4212,
      "step": 920
    },
    {
      "epoch": 1.423106350420811,
      "grad_norm": 0.1932903343458798,
      "learning_rate": 0.00012598229615482954,
      "loss": 0.4127,
      "step": 930
    },
    {
      "epoch": 1.4384085692425401,
      "grad_norm": 0.20197440137591363,
      "learning_rate": 0.0001242575028566632,
      "loss": 0.4118,
      "step": 940
    },
    {
      "epoch": 1.4537107880642695,
      "grad_norm": 0.20030727904644482,
      "learning_rate": 0.00012252500709373934,
      "loss": 0.4133,
      "step": 950
    },
    {
      "epoch": 1.4690130068859983,
      "grad_norm": 0.20837354923941284,
      "learning_rate": 0.00012078535898400019,
      "loss": 0.4117,
      "step": 960
    },
    {
      "epoch": 1.4843152257077277,
      "grad_norm": 0.19925595673498087,
      "learning_rate": 0.00011903911091646684,
      "loss": 0.4083,
      "step": 970
    },
    {
      "epoch": 1.4996174445294568,
      "grad_norm": 0.2084484154721651,
      "learning_rate": 0.00011728681737583945,
      "loss": 0.408,
      "step": 980
    },
    {
      "epoch": 1.5149196633511859,
      "grad_norm": 0.20623569681522963,
      "learning_rate": 0.00011552903476643222,
      "loss": 0.4117,
      "step": 990
    },
    {
      "epoch": 1.5302218821729152,
      "grad_norm": 0.19912897780166905,
      "learning_rate": 0.0001137663212354988,
      "loss": 0.4071,
      "step": 1000
    },
    {
      "epoch": 1.545524100994644,
      "grad_norm": 0.2015032395684377,
      "learning_rate": 0.00011199923649600432,
      "loss": 0.4085,
      "step": 1010
    },
    {
      "epoch": 1.5608263198163734,
      "grad_norm": 0.19846674785205634,
      "learning_rate": 0.0001102283416489001,
      "loss": 0.4094,
      "step": 1020
    },
    {
      "epoch": 1.5761285386381025,
      "grad_norm": 0.19503368149396527,
      "learning_rate": 0.00010845419900495772,
      "loss": 0.4031,
      "step": 1030
    },
    {
      "epoch": 1.5914307574598316,
      "grad_norm": 0.20636304448971973,
      "learning_rate": 0.00010667737190621911,
      "loss": 0.4141,
      "step": 1040
    },
    {
      "epoch": 1.606732976281561,
      "grad_norm": 0.19741819850815578,
      "learning_rate": 0.0001048984245471188,
      "loss": 0.4123,
      "step": 1050
    },
    {
      "epoch": 1.6220351951032899,
      "grad_norm": 0.208094236192064,
      "learning_rate": 0.00010311792179533589,
      "loss": 0.4094,
      "step": 1060
    },
    {
      "epoch": 1.6373374139250192,
      "grad_norm": 0.20834394157642375,
      "learning_rate": 0.00010133642901243199,
      "loss": 0.4073,
      "step": 1070
    },
    {
      "epoch": 1.6526396327467483,
      "grad_norm": 0.20325976490051528,
      "learning_rate": 9.955451187433249e-05,
      "loss": 0.4072,
      "step": 1080
    },
    {
      "epoch": 1.6679418515684774,
      "grad_norm": 0.19996940436744937,
      "learning_rate": 9.777273619170796e-05,
      "loss": 0.4067,
      "step": 1090
    },
    {
      "epoch": 1.6832440703902067,
      "grad_norm": 0.19609861659837347,
      "learning_rate": 9.599166773031269e-05,
      "loss": 0.405,
      "step": 1100
    },
    {
      "epoch": 1.6985462892119356,
      "grad_norm": 0.20661663114073753,
      "learning_rate": 9.421187203133763e-05,
      "loss": 0.4089,
      "step": 1110
    },
    {
      "epoch": 1.713848508033665,
      "grad_norm": 0.19921721692896047,
      "learning_rate": 9.243391423183448e-05,
      "loss": 0.4071,
      "step": 1120
    },
    {
      "epoch": 1.729150726855394,
      "grad_norm": 0.20297141939912075,
      "learning_rate": 9.06583588852683e-05,
      "loss": 0.4043,
      "step": 1130
    },
    {
      "epoch": 1.7444529456771232,
      "grad_norm": 0.20714391729089626,
      "learning_rate": 8.888576978225527e-05,
      "loss": 0.4031,
      "step": 1140
    },
    {
      "epoch": 1.7597551644988525,
      "grad_norm": 0.20284798828869055,
      "learning_rate": 8.711670977154274e-05,
      "loss": 0.4051,
      "step": 1150
    },
    {
      "epoch": 1.7750573833205814,
      "grad_norm": 0.20621816413716504,
      "learning_rate": 8.535174058128812e-05,
      "loss": 0.4028,
      "step": 1160
    },
    {
      "epoch": 1.7903596021423107,
      "grad_norm": 0.2107861816265634,
      "learning_rate": 8.359142264069424e-05,
      "loss": 0.3972,
      "step": 1170
    },
    {
      "epoch": 1.8056618209640398,
      "grad_norm": 0.20926246575879495,
      "learning_rate": 8.183631490205637e-05,
      "loss": 0.4033,
      "step": 1180
    },
    {
      "epoch": 1.820964039785769,
      "grad_norm": 0.20398570550310058,
      "learning_rate": 8.008697466327865e-05,
      "loss": 0.3994,
      "step": 1190
    },
    {
      "epoch": 1.836266258607498,
      "grad_norm": 0.21408610304309247,
      "learning_rate": 7.834395739091585e-05,
      "loss": 0.3971,
      "step": 1200
    },
    {
      "epoch": 1.8515684774292271,
      "grad_norm": 0.2059678534725396,
      "learning_rate": 7.660781654379638e-05,
      "loss": 0.4004,
      "step": 1210
    },
    {
      "epoch": 1.8668706962509565,
      "grad_norm": 0.20089108656511492,
      "learning_rate": 7.487910339728308e-05,
      "loss": 0.398,
      "step": 1220
    },
    {
      "epoch": 1.8821729150726856,
      "grad_norm": 0.2089057556225053,
      "learning_rate": 7.315836686822729e-05,
      "loss": 0.4,
      "step": 1230
    },
    {
      "epoch": 1.8974751338944147,
      "grad_norm": 0.2076979091587327,
      "learning_rate": 7.14461533406714e-05,
      "loss": 0.3942,
      "step": 1240
    },
    {
      "epoch": 1.9127773527161438,
      "grad_norm": 0.2075858197523019,
      "learning_rate": 6.974300649235633e-05,
      "loss": 0.4004,
      "step": 1250
    },
    {
      "epoch": 1.928079571537873,
      "grad_norm": 0.21210310960815237,
      "learning_rate": 6.804946712208793e-05,
      "loss": 0.4021,
      "step": 1260
    },
    {
      "epoch": 1.9433817903596022,
      "grad_norm": 0.21341887577251997,
      "learning_rate": 6.63660729780174e-05,
      "loss": 0.3954,
      "step": 1270
    },
    {
      "epoch": 1.9586840091813313,
      "grad_norm": 0.22172559452313256,
      "learning_rate": 6.469335858689074e-05,
      "loss": 0.4002,
      "step": 1280
    },
    {
      "epoch": 1.9739862280030605,
      "grad_norm": 0.20709389048082577,
      "learning_rate": 6.303185508432085e-05,
      "loss": 0.4018,
      "step": 1290
    },
    {
      "epoch": 1.9892884468247896,
      "grad_norm": 0.21370736036441443,
      "learning_rate": 6.138209004613647e-05,
      "loss": 0.3955,
      "step": 1300
    },
    {
      "epoch": 2.0045906656465187,
      "grad_norm": 0.20094204383719153,
      "learning_rate": 5.974458732086149e-05,
      "loss": 0.3851,
      "step": 1310
    },
    {
      "epoch": 2.019892884468248,
      "grad_norm": 0.22289055701062488,
      "learning_rate": 5.81198668633778e-05,
      "loss": 0.3674,
      "step": 1320
    },
    {
      "epoch": 2.035195103289977,
      "grad_norm": 0.22724332895866212,
      "learning_rate": 5.6508444569824315e-05,
      "loss": 0.3614,
      "step": 1330
    },
    {
      "epoch": 2.050497322111706,
      "grad_norm": 0.23888425948419684,
      "learning_rate": 5.491083211378505e-05,
      "loss": 0.3614,
      "step": 1340
    },
    {
      "epoch": 2.0657995409334355,
      "grad_norm": 0.23068081709857463,
      "learning_rate": 5.3327536783817766e-05,
      "loss": 0.3644,
      "step": 1350
    },
    {
      "epoch": 2.0811017597551644,
      "grad_norm": 0.2460582871625401,
      "learning_rate": 5.1759061322375045e-05,
      "loss": 0.3634,
      "step": 1360
    },
    {
      "epoch": 2.0964039785768938,
      "grad_norm": 0.24787239171921766,
      "learning_rate": 5.0205903766168915e-05,
      "loss": 0.3612,
      "step": 1370
    },
    {
      "epoch": 2.1117061973986226,
      "grad_norm": 0.23563510103472043,
      "learning_rate": 4.8668557288029684e-05,
      "loss": 0.3627,
      "step": 1380
    },
    {
      "epoch": 2.127008416220352,
      "grad_norm": 0.23966764070729765,
      "learning_rate": 4.7147510040309115e-05,
      "loss": 0.3633,
      "step": 1390
    },
    {
      "epoch": 2.1423106350420813,
      "grad_norm": 0.24153476110732292,
      "learning_rate": 4.56432449998779e-05,
      "loss": 0.3621,
      "step": 1400
    },
    {
      "epoch": 2.15761285386381,
      "grad_norm": 0.23966649969853318,
      "learning_rate": 4.4156239814766355e-05,
      "loss": 0.3597,
      "step": 1410
    },
    {
      "epoch": 2.1729150726855395,
      "grad_norm": 0.2492626290666687,
      "learning_rate": 4.268696665249724e-05,
      "loss": 0.3663,
      "step": 1420
    },
    {
      "epoch": 2.1882172915072684,
      "grad_norm": 0.24762132725809086,
      "learning_rate": 4.1235892050158866e-05,
      "loss": 0.3604,
      "step": 1430
    },
    {
      "epoch": 2.2035195103289977,
      "grad_norm": 0.24689838661058983,
      "learning_rate": 3.9803476766265835e-05,
      "loss": 0.3648,
      "step": 1440
    },
    {
      "epoch": 2.218821729150727,
      "grad_norm": 0.24705256051578678,
      "learning_rate": 3.839017563445489e-05,
      "loss": 0.3625,
      "step": 1450
    },
    {
      "epoch": 2.234123947972456,
      "grad_norm": 0.2496443256139868,
      "learning_rate": 3.699643741906193e-05,
      "loss": 0.3545,
      "step": 1460
    },
    {
      "epoch": 2.2494261667941853,
      "grad_norm": 0.249005841241916,
      "learning_rate": 3.562270467262619e-05,
      "loss": 0.3601,
      "step": 1470
    },
    {
      "epoch": 2.264728385615914,
      "grad_norm": 0.24757649232051404,
      "learning_rate": 3.426941359536699e-05,
      "loss": 0.3576,
      "step": 1480
    },
    {
      "epoch": 2.2800306044376435,
      "grad_norm": 0.25451870114574876,
      "learning_rate": 3.293699389667734e-05,
      "loss": 0.3648,
      "step": 1490
    },
    {
      "epoch": 2.295332823259373,
      "grad_norm": 0.2453131567916715,
      "learning_rate": 3.1625868658678784e-05,
      "loss": 0.3625,
      "step": 1500
    },
    {
      "epoch": 2.3106350420811017,
      "grad_norm": 0.26123253732278273,
      "learning_rate": 3.0336454201880404e-05,
      "loss": 0.3557,
      "step": 1510
    },
    {
      "epoch": 2.325937260902831,
      "grad_norm": 0.2626580334343511,
      "learning_rate": 2.9069159952984938e-05,
      "loss": 0.3566,
      "step": 1520
    },
    {
      "epoch": 2.34123947972456,
      "grad_norm": 0.25003501828393526,
      "learning_rate": 2.7824388314883876e-05,
      "loss": 0.3591,
      "step": 1530
    },
    {
      "epoch": 2.3565416985462893,
      "grad_norm": 0.2507207739640118,
      "learning_rate": 2.6602534538882752e-05,
      "loss": 0.358,
      "step": 1540
    },
    {
      "epoch": 2.371843917368018,
      "grad_norm": 0.2512257287691549,
      "learning_rate": 2.5403986599197403e-05,
      "loss": 0.3501,
      "step": 1550
    },
    {
      "epoch": 2.3871461361897475,
      "grad_norm": 0.2499142391014787,
      "learning_rate": 2.4229125069760773e-05,
      "loss": 0.3517,
      "step": 1560
    },
    {
      "epoch": 2.402448355011477,
      "grad_norm": 0.24884412260444377,
      "learning_rate": 2.30783230033796e-05,
      "loss": 0.3568,
      "step": 1570
    },
    {
      "epoch": 2.4177505738332057,
      "grad_norm": 0.25335712662759613,
      "learning_rate": 2.1951945813279306e-05,
      "loss": 0.3547,
      "step": 1580
    },
    {
      "epoch": 2.433052792654935,
      "grad_norm": 0.24937746984746909,
      "learning_rate": 2.0850351157074598e-05,
      "loss": 0.353,
      "step": 1590
    },
    {
      "epoch": 2.4483550114766643,
      "grad_norm": 0.26237405083230225,
      "learning_rate": 1.9773888823202747e-05,
      "loss": 0.3579,
      "step": 1600
    },
    {
      "epoch": 2.4636572302983932,
      "grad_norm": 0.24937407019841143,
      "learning_rate": 1.8722900619855577e-05,
      "loss": 0.3562,
      "step": 1610
    },
    {
      "epoch": 2.4789594491201226,
      "grad_norm": 0.2497739493217595,
      "learning_rate": 1.7697720266445374e-05,
      "loss": 0.3512,
      "step": 1620
    },
    {
      "epoch": 2.4942616679418514,
      "grad_norm": 0.250399923768968,
      "learning_rate": 1.6698673287639242e-05,
      "loss": 0.3556,
      "step": 1630
    },
    {
      "epoch": 2.5095638867635808,
      "grad_norm": 0.2538234603963528,
      "learning_rate": 1.5726076909995525e-05,
      "loss": 0.355,
      "step": 1640
    },
    {
      "epoch": 2.5248661055853097,
      "grad_norm": 0.2504434686740776,
      "learning_rate": 1.4780239961235143e-05,
      "loss": 0.3581,
      "step": 1650
    },
    {
      "epoch": 2.540168324407039,
      "grad_norm": 0.2537006354023553,
      "learning_rate": 1.3861462772179735e-05,
      "loss": 0.3529,
      "step": 1660
    },
    {
      "epoch": 2.5554705432287683,
      "grad_norm": 0.2560925692389684,
      "learning_rate": 1.297003708138792e-05,
      "loss": 0.3584,
      "step": 1670
    },
    {
      "epoch": 2.570772762050497,
      "grad_norm": 0.25340005468102866,
      "learning_rate": 1.2106245942519745e-05,
      "loss": 0.3562,
      "step": 1680
    },
    {
      "epoch": 2.5860749808722265,
      "grad_norm": 0.25785387176401525,
      "learning_rate": 1.1270363634458903e-05,
      "loss": 0.3541,
      "step": 1690
    },
    {
      "epoch": 2.601377199693956,
      "grad_norm": 0.2615280811952495,
      "learning_rate": 1.0462655574221213e-05,
      "loss": 0.3553,
      "step": 1700
    },
    {
      "epoch": 2.6166794185156848,
      "grad_norm": 0.24663819526398556,
      "learning_rate": 9.683378232676965e-06,
      "loss": 0.3532,
      "step": 1710
    },
    {
      "epoch": 2.631981637337414,
      "grad_norm": 0.2614751928374674,
      "learning_rate": 8.932779053113893e-06,
      "loss": 0.3546,
      "step": 1720
    },
    {
      "epoch": 2.647283856159143,
      "grad_norm": 0.25884670404819016,
      "learning_rate": 8.211096372666783e-06,
      "loss": 0.3559,
      "step": 1730
    },
    {
      "epoch": 2.6625860749808723,
      "grad_norm": 0.26303763365287397,
      "learning_rate": 7.518559346638432e-06,
      "loss": 0.3556,
      "step": 1740
    },
    {
      "epoch": 2.677888293802601,
      "grad_norm": 0.2591833975916801,
      "learning_rate": 6.855387875736152e-06,
      "loss": 0.3577,
      "step": 1750
    },
    {
      "epoch": 2.6931905126243305,
      "grad_norm": 0.24688502674320847,
      "learning_rate": 6.221792536246973e-06,
      "loss": 0.3557,
      "step": 1760
    },
    {
      "epoch": 2.70849273144606,
      "grad_norm": 0.26100100849151864,
      "learning_rate": 5.617974513173341e-06,
      "loss": 0.3548,
      "step": 1770
    },
    {
      "epoch": 2.7237949502677887,
      "grad_norm": 0.2699346579349574,
      "learning_rate": 5.044125536351196e-06,
      "loss": 0.3511,
      "step": 1780
    },
    {
      "epoch": 2.739097169089518,
      "grad_norm": 0.2717440390716044,
      "learning_rate": 4.500427819570097e-06,
      "loss": 0.3494,
      "step": 1790
    },
    {
      "epoch": 2.7543993879112474,
      "grad_norm": 0.2579283254483208,
      "learning_rate": 3.987054002714952e-06,
      "loss": 0.3569,
      "step": 1800
    },
    {
      "epoch": 2.7697016067329763,
      "grad_norm": 0.2594723269046269,
      "learning_rate": 3.504167096947952e-06,
      "loss": 0.3516,
      "step": 1810
    },
    {
      "epoch": 2.785003825554705,
      "grad_norm": 0.256455133609715,
      "learning_rate": 3.051920432947664e-06,
      "loss": 0.3512,
      "step": 1820
    },
    {
      "epoch": 2.8003060443764345,
      "grad_norm": 0.2621246657840175,
      "learning_rate": 2.6304576122221035e-06,
      "loss": 0.3527,
      "step": 1830
    },
    {
      "epoch": 2.815608263198164,
      "grad_norm": 0.26278995363196544,
      "learning_rate": 2.2399124615110846e-06,
      "loss": 0.3518,
      "step": 1840
    },
    {
      "epoch": 2.8309104820198927,
      "grad_norm": 0.27091602282401345,
      "learning_rate": 1.880408990292315e-06,
      "loss": 0.3581,
      "step": 1850
    },
    {
      "epoch": 2.846212700841622,
      "grad_norm": 0.25572786473847814,
      "learning_rate": 1.5520613514047655e-06,
      "loss": 0.3562,
      "step": 1860
    },
    {
      "epoch": 2.8615149196633514,
      "grad_norm": 0.26244627787054337,
      "learning_rate": 1.2549738048017846e-06,
      "loss": 0.3552,
      "step": 1870
    },
    {
      "epoch": 2.8768171384850802,
      "grad_norm": 0.26131103676147655,
      "learning_rate": 9.892406844456026e-07,
      "loss": 0.352,
      "step": 1880
    },
    {
      "epoch": 2.8921193573068096,
      "grad_norm": 0.27449669022834305,
      "learning_rate": 7.549463683534374e-07,
      "loss": 0.3504,
      "step": 1890
    },
    {
      "epoch": 2.907421576128539,
      "grad_norm": 0.264605800831863,
      "learning_rate": 5.521652518051368e-07,
      "loss": 0.3515,
      "step": 1900
    },
    {
      "epoch": 2.922723794950268,
      "grad_norm": 0.2618398016329481,
      "learning_rate": 3.809617237203744e-07,
      "loss": 0.3561,
      "step": 1910
    },
    {
      "epoch": 2.9380260137719967,
      "grad_norm": 0.26831015620903936,
      "learning_rate": 2.4139014621340494e-07,
      "loss": 0.3527,
      "step": 1920
    },
    {
      "epoch": 2.953328232593726,
      "grad_norm": 0.26666302239559175,
      "learning_rate": 1.334948373314493e-07,
      "loss": 0.3562,
      "step": 1930
    },
    {
      "epoch": 2.9686304514154553,
      "grad_norm": 0.262359494195635,
      "learning_rate": 5.7310056982418094e-08,
      "loss": 0.355,
      "step": 1940
    },
    {
      "epoch": 2.9839326702371842,
      "grad_norm": 0.2576422839291312,
      "learning_rate": 1.2859996056402423e-08,
      "loss": 0.3531,
      "step": 1950
    },
    {
      "epoch": 2.9977046671767407,
      "step": 1959,
      "total_flos": 3660773289099264.0,
      "train_loss": 0.4337311900958168,
      "train_runtime": 5331.8775,
      "train_samples_per_second": 5.882,
      "train_steps_per_second": 0.367
    }
  ],
  "logging_steps": 10,
  "max_steps": 1959,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3660773289099264.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}