{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 2691,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.011148272017837236,
      "grad_norm": 3.7741404834185763,
      "learning_rate": 3.7037037037037036e-07,
      "loss": 0.3679,
      "step": 10
    },
    {
      "epoch": 0.022296544035674472,
      "grad_norm": 1.8040349271595875,
      "learning_rate": 7.407407407407407e-07,
      "loss": 0.3385,
      "step": 20
    },
    {
      "epoch": 0.033444816053511704,
      "grad_norm": 1.1311364667628296,
      "learning_rate": 1.111111111111111e-06,
      "loss": 0.3008,
      "step": 30
    },
    {
      "epoch": 0.044593088071348944,
      "grad_norm": 1.096280133534896,
      "learning_rate": 1.4814814814814815e-06,
      "loss": 0.2689,
      "step": 40
    },
    {
      "epoch": 0.055741360089186176,
      "grad_norm": 0.7223895222097435,
      "learning_rate": 1.8518518518518519e-06,
      "loss": 0.2506,
      "step": 50
    },
    {
      "epoch": 0.06688963210702341,
      "grad_norm": 0.6507379554846837,
      "learning_rate": 2.222222222222222e-06,
      "loss": 0.24,
      "step": 60
    },
    {
      "epoch": 0.07803790412486064,
      "grad_norm": 0.5228577418159993,
      "learning_rate": 2.5925925925925925e-06,
      "loss": 0.2374,
      "step": 70
    },
    {
      "epoch": 0.08918617614269789,
      "grad_norm": 0.5725671090349675,
      "learning_rate": 2.962962962962963e-06,
      "loss": 0.2343,
      "step": 80
    },
    {
      "epoch": 0.10033444816053512,
      "grad_norm": 0.575895692421762,
      "learning_rate": 3.3333333333333333e-06,
      "loss": 0.2279,
      "step": 90
    },
    {
      "epoch": 0.11148272017837235,
      "grad_norm": 0.5723674270595959,
      "learning_rate": 3.7037037037037037e-06,
      "loss": 0.2263,
      "step": 100
    },
    {
      "epoch": 0.12263099219620958,
      "grad_norm": 0.6264644346158856,
      "learning_rate": 4.074074074074074e-06,
      "loss": 0.2239,
      "step": 110
    },
    {
      "epoch": 0.13377926421404682,
      "grad_norm": 0.7124762074512575,
      "learning_rate": 4.444444444444444e-06,
      "loss": 0.2232,
      "step": 120
    },
    {
      "epoch": 0.14492753623188406,
      "grad_norm": 0.5878673377883962,
      "learning_rate": 4.814814814814815e-06,
      "loss": 0.2231,
      "step": 130
    },
    {
      "epoch": 0.15607580824972128,
      "grad_norm": 0.821524645221957,
      "learning_rate": 5.185185185185185e-06,
      "loss": 0.2215,
      "step": 140
    },
    {
      "epoch": 0.16722408026755853,
      "grad_norm": 0.6626631852309735,
      "learning_rate": 5.555555555555557e-06,
      "loss": 0.2203,
      "step": 150
    },
    {
      "epoch": 0.17837235228539577,
      "grad_norm": 0.7549263527576757,
      "learning_rate": 5.925925925925926e-06,
      "loss": 0.2186,
      "step": 160
    },
    {
      "epoch": 0.189520624303233,
      "grad_norm": 0.695178742046847,
      "learning_rate": 6.296296296296297e-06,
      "loss": 0.2184,
      "step": 170
    },
    {
      "epoch": 0.20066889632107024,
      "grad_norm": 0.7460261830067356,
      "learning_rate": 6.666666666666667e-06,
      "loss": 0.22,
      "step": 180
    },
    {
      "epoch": 0.21181716833890746,
      "grad_norm": 0.7489258972164311,
      "learning_rate": 7.0370370370370375e-06,
      "loss": 0.2181,
      "step": 190
    },
    {
      "epoch": 0.2229654403567447,
      "grad_norm": 0.7563607766637321,
      "learning_rate": 7.4074074074074075e-06,
      "loss": 0.221,
      "step": 200
    },
    {
      "epoch": 0.23411371237458195,
      "grad_norm": 0.6786891318993898,
      "learning_rate": 7.77777777777778e-06,
      "loss": 0.2188,
      "step": 210
    },
    {
      "epoch": 0.24526198439241917,
      "grad_norm": 0.718723752297412,
      "learning_rate": 8.148148148148148e-06,
      "loss": 0.2191,
      "step": 220
    },
    {
      "epoch": 0.2564102564102564,
      "grad_norm": 0.79515342774086,
      "learning_rate": 8.518518518518519e-06,
      "loss": 0.222,
      "step": 230
    },
    {
      "epoch": 0.26755852842809363,
      "grad_norm": 0.6773955348554602,
      "learning_rate": 8.888888888888888e-06,
      "loss": 0.2189,
      "step": 240
    },
    {
      "epoch": 0.2787068004459309,
      "grad_norm": 0.6019195042882624,
      "learning_rate": 9.25925925925926e-06,
      "loss": 0.2177,
      "step": 250
    },
    {
      "epoch": 0.2898550724637681,
      "grad_norm": 0.6564030437537817,
      "learning_rate": 9.62962962962963e-06,
      "loss": 0.2202,
      "step": 260
    },
    {
      "epoch": 0.3010033444816054,
      "grad_norm": 0.6392190084647338,
      "learning_rate": 1e-05,
      "loss": 0.217,
      "step": 270
    },
    {
      "epoch": 0.31215161649944256,
      "grad_norm": 0.6409199129954214,
      "learning_rate": 9.999579036849509e-06,
      "loss": 0.2184,
      "step": 280
    },
    {
      "epoch": 0.3232998885172798,
      "grad_norm": 0.6213419108722281,
      "learning_rate": 9.998316218282025e-06,
      "loss": 0.2165,
      "step": 290
    },
    {
      "epoch": 0.33444816053511706,
      "grad_norm": 0.6837626955419673,
      "learning_rate": 9.996211756937579e-06,
      "loss": 0.2153,
      "step": 300
    },
    {
      "epoch": 0.3455964325529543,
      "grad_norm": 0.6153911123312514,
      "learning_rate": 9.993266007176446e-06,
      "loss": 0.2183,
      "step": 310
    },
    {
      "epoch": 0.35674470457079155,
      "grad_norm": 0.6491533849083898,
      "learning_rate": 9.989479465019462e-06,
      "loss": 0.2168,
      "step": 320
    },
    {
      "epoch": 0.36789297658862874,
      "grad_norm": 0.6331404269468602,
      "learning_rate": 9.984852768064516e-06,
      "loss": 0.2164,
      "step": 330
    },
    {
      "epoch": 0.379041248606466,
      "grad_norm": 0.6066866794436855,
      "learning_rate": 9.979386695379179e-06,
      "loss": 0.2157,
      "step": 340
    },
    {
      "epoch": 0.39018952062430323,
      "grad_norm": 0.6283882559447943,
      "learning_rate": 9.973082167369521e-06,
      "loss": 0.2144,
      "step": 350
    },
    {
      "epoch": 0.4013377926421405,
      "grad_norm": 0.620915432947406,
      "learning_rate": 9.965940245625131e-06,
      "loss": 0.2109,
      "step": 360
    },
    {
      "epoch": 0.4124860646599777,
      "grad_norm": 0.5581175223325301,
      "learning_rate": 9.95796213274036e-06,
      "loss": 0.2159,
      "step": 370
    },
    {
      "epoch": 0.4236343366778149,
      "grad_norm": 0.7003333387574542,
      "learning_rate": 9.949149172111825e-06,
      "loss": 0.2154,
      "step": 380
    },
    {
      "epoch": 0.43478260869565216,
      "grad_norm": 0.6965889831011358,
      "learning_rate": 9.93950284771219e-06,
      "loss": 0.215,
      "step": 390
    },
    {
      "epoch": 0.4459308807134894,
      "grad_norm": 0.6783561953012276,
      "learning_rate": 9.929024783840302e-06,
      "loss": 0.2156,
      "step": 400
    },
    {
      "epoch": 0.45707915273132665,
      "grad_norm": 0.6133815858556971,
      "learning_rate": 9.917716744847674e-06,
      "loss": 0.2135,
      "step": 410
    },
    {
      "epoch": 0.4682274247491639,
      "grad_norm": 0.5517297679124203,
      "learning_rate": 9.905580634841388e-06,
      "loss": 0.2127,
      "step": 420
    },
    {
      "epoch": 0.4793756967670011,
      "grad_norm": 0.5672097715363509,
      "learning_rate": 9.892618497363491e-06,
      "loss": 0.2152,
      "step": 430
    },
    {
      "epoch": 0.49052396878483834,
      "grad_norm": 0.7373862211696988,
      "learning_rate": 9.878832515046873e-06,
      "loss": 0.2121,
      "step": 440
    },
    {
      "epoch": 0.5016722408026756,
      "grad_norm": 0.5683396526691493,
      "learning_rate": 9.864225009247753e-06,
      "loss": 0.2124,
      "step": 450
    },
    {
      "epoch": 0.5128205128205128,
      "grad_norm": 0.5991852837643047,
      "learning_rate": 9.848798439654797e-06,
      "loss": 0.2116,
      "step": 460
    },
    {
      "epoch": 0.5239687848383501,
      "grad_norm": 0.5945184596013314,
      "learning_rate": 9.832555403874937e-06,
      "loss": 0.2107,
      "step": 470
    },
    {
      "epoch": 0.5351170568561873,
      "grad_norm": 0.5308451359275891,
      "learning_rate": 9.815498636995982e-06,
      "loss": 0.2092,
      "step": 480
    },
    {
      "epoch": 0.5462653288740246,
      "grad_norm": 0.5709398866928973,
      "learning_rate": 9.797631011126061e-06,
      "loss": 0.2098,
      "step": 490
    },
    {
      "epoch": 0.5574136008918618,
      "grad_norm": 0.570042268652287,
      "learning_rate": 9.778955534910004e-06,
      "loss": 0.2107,
      "step": 500
    },
    {
      "epoch": 0.5574136008918618,
      "eval_accuracy": 0.9207988997610126,
      "eval_loss": 0.21421977877616882,
      "eval_runtime": 5.3128,
      "eval_samples_per_second": 43.292,
      "eval_steps_per_second": 1.506,
      "step": 500
    },
    {
      "epoch": 0.568561872909699,
      "grad_norm": 0.6073438181861355,
      "learning_rate": 9.759475353022735e-06,
      "loss": 0.2096,
      "step": 510
    },
    {
      "epoch": 0.5797101449275363,
      "grad_norm": 0.5805559872103752,
      "learning_rate": 9.739193745639745e-06,
      "loss": 0.2088,
      "step": 520
    },
    {
      "epoch": 0.5908584169453734,
      "grad_norm": 0.569674689944485,
      "learning_rate": 9.718114127884774e-06,
      "loss": 0.2079,
      "step": 530
    },
    {
      "epoch": 0.6020066889632107,
      "grad_norm": 0.6479348013496814,
      "learning_rate": 9.696240049254744e-06,
      "loss": 0.2095,
      "step": 540
    },
    {
      "epoch": 0.6131549609810479,
      "grad_norm": 0.6837993590888451,
      "learning_rate": 9.673575193022073e-06,
      "loss": 0.2103,
      "step": 550
    },
    {
      "epoch": 0.6243032329988851,
      "grad_norm": 0.5658586658235949,
      "learning_rate": 9.650123375614477e-06,
      "loss": 0.209,
      "step": 560
    },
    {
      "epoch": 0.6354515050167224,
      "grad_norm": 0.525804021378744,
      "learning_rate": 9.625888545972333e-06,
      "loss": 0.2109,
      "step": 570
    },
    {
      "epoch": 0.6465997770345596,
      "grad_norm": 0.5027074127285335,
      "learning_rate": 9.600874784883736e-06,
      "loss": 0.2072,
      "step": 580
    },
    {
      "epoch": 0.6577480490523969,
      "grad_norm": 0.5503527245541562,
      "learning_rate": 9.575086304297351e-06,
      "loss": 0.2068,
      "step": 590
    },
    {
      "epoch": 0.6688963210702341,
      "grad_norm": 0.5786235649699455,
      "learning_rate": 9.548527446613198e-06,
      "loss": 0.2088,
      "step": 600
    },
    {
      "epoch": 0.6800445930880713,
      "grad_norm": 0.5963491322528846,
      "learning_rate": 9.521202683951436e-06,
      "loss": 0.2079,
      "step": 610
    },
    {
      "epoch": 0.6911928651059086,
      "grad_norm": 0.5483351877002366,
      "learning_rate": 9.493116617399333e-06,
      "loss": 0.2086,
      "step": 620
    },
    {
      "epoch": 0.7023411371237458,
      "grad_norm": 0.5231384119737766,
      "learning_rate": 9.464273976236518e-06,
      "loss": 0.2079,
      "step": 630
    },
    {
      "epoch": 0.7134894091415831,
      "grad_norm": 0.5728914844384684,
      "learning_rate": 9.434679617138624e-06,
      "loss": 0.207,
      "step": 640
    },
    {
      "epoch": 0.7246376811594203,
      "grad_norm": 0.6056684783613336,
      "learning_rate": 9.404338523359511e-06,
      "loss": 0.2072,
      "step": 650
    },
    {
      "epoch": 0.7357859531772575,
      "grad_norm": 0.48483746742809725,
      "learning_rate": 9.373255803892149e-06,
      "loss": 0.2043,
      "step": 660
    },
    {
      "epoch": 0.7469342251950948,
      "grad_norm": 0.5052853090283119,
      "learning_rate": 9.341436692608341e-06,
      "loss": 0.2059,
      "step": 670
    },
    {
      "epoch": 0.758082497212932,
      "grad_norm": 0.5013961896834831,
      "learning_rate": 9.30888654737742e-06,
      "loss": 0.2045,
      "step": 680
    },
    {
      "epoch": 0.7692307692307693,
      "grad_norm": 0.6136168534682885,
      "learning_rate": 9.275610849164066e-06,
      "loss": 0.2037,
      "step": 690
    },
    {
      "epoch": 0.7803790412486065,
      "grad_norm": 0.5454932461837725,
      "learning_rate": 9.241615201105372e-06,
      "loss": 0.2055,
      "step": 700
    },
    {
      "epoch": 0.7915273132664437,
      "grad_norm": 0.630054882059704,
      "learning_rate": 9.206905327567385e-06,
      "loss": 0.2084,
      "step": 710
    },
    {
      "epoch": 0.802675585284281,
      "grad_norm": 0.5115790207210278,
      "learning_rate": 9.171487073181198e-06,
      "loss": 0.2054,
      "step": 720
    },
    {
      "epoch": 0.8138238573021181,
      "grad_norm": 0.522185957547097,
      "learning_rate": 9.135366401858782e-06,
      "loss": 0.2073,
      "step": 730
    },
    {
      "epoch": 0.8249721293199554,
      "grad_norm": 0.5370558619096407,
      "learning_rate": 9.098549395788784e-06,
      "loss": 0.2073,
      "step": 740
    },
    {
      "epoch": 0.8361204013377926,
      "grad_norm": 0.5143052735973912,
      "learning_rate": 9.061042254412346e-06,
      "loss": 0.2039,
      "step": 750
    },
    {
      "epoch": 0.8472686733556298,
      "grad_norm": 0.5271311850347303,
      "learning_rate": 9.022851293379232e-06,
      "loss": 0.2053,
      "step": 760
    },
    {
      "epoch": 0.8584169453734671,
      "grad_norm": 0.54232371019945,
      "learning_rate": 8.983982943484348e-06,
      "loss": 0.2023,
      "step": 770
    },
    {
      "epoch": 0.8695652173913043,
      "grad_norm": 0.5365286519965239,
      "learning_rate": 8.944443749584908e-06,
      "loss": 0.2043,
      "step": 780
    },
    {
      "epoch": 0.8807134894091416,
      "grad_norm": 0.49203900940366796,
      "learning_rate": 8.904240369498363e-06,
      "loss": 0.2013,
      "step": 790
    },
    {
      "epoch": 0.8918617614269788,
      "grad_norm": 0.6067905351468151,
      "learning_rate": 8.863379572881328e-06,
      "loss": 0.2038,
      "step": 800
    },
    {
      "epoch": 0.903010033444816,
      "grad_norm": 0.5687299547717642,
      "learning_rate": 8.821868240089676e-06,
      "loss": 0.2044,
      "step": 810
    },
    {
      "epoch": 0.9141583054626533,
      "grad_norm": 0.5124900416142478,
      "learning_rate": 8.779713361019983e-06,
      "loss": 0.2068,
      "step": 820
    },
    {
      "epoch": 0.9253065774804905,
      "grad_norm": 0.48310445826631493,
      "learning_rate": 8.736922033932522e-06,
      "loss": 0.2022,
      "step": 830
    },
    {
      "epoch": 0.9364548494983278,
      "grad_norm": 0.5272157184999492,
      "learning_rate": 8.693501464256046e-06,
      "loss": 0.2027,
      "step": 840
    },
    {
      "epoch": 0.947603121516165,
      "grad_norm": 0.5270482869515378,
      "learning_rate": 8.649458963374474e-06,
      "loss": 0.2032,
      "step": 850
    },
    {
      "epoch": 0.9587513935340022,
      "grad_norm": 0.5738980238907102,
      "learning_rate": 8.604801947395776e-06,
      "loss": 0.2,
      "step": 860
    },
    {
      "epoch": 0.9698996655518395,
      "grad_norm": 0.4603491880478103,
      "learning_rate": 8.55953793590321e-06,
      "loss": 0.2021,
      "step": 870
    },
    {
      "epoch": 0.9810479375696767,
      "grad_norm": 0.548549890115731,
      "learning_rate": 8.513674550689128e-06,
      "loss": 0.2002,
      "step": 880
    },
    {
      "epoch": 0.992196209587514,
      "grad_norm": 0.49756226702185397,
      "learning_rate": 8.46721951447158e-06,
      "loss": 0.2024,
      "step": 890
    },
    {
      "epoch": 1.0033444816053512,
      "grad_norm": 0.6472920299598118,
      "learning_rate": 8.42018064959393e-06,
      "loss": 0.1903,
      "step": 900
    },
    {
      "epoch": 1.0144927536231885,
      "grad_norm": 0.5997372944954878,
      "learning_rate": 8.37256587670768e-06,
      "loss": 0.1599,
      "step": 910
    },
    {
      "epoch": 1.0256410256410255,
      "grad_norm": 0.6661365745997817,
      "learning_rate": 8.32438321343875e-06,
      "loss": 0.1578,
      "step": 920
    },
    {
      "epoch": 1.0367892976588629,
      "grad_norm": 0.5526716468068121,
      "learning_rate": 8.275640773037432e-06,
      "loss": 0.1584,
      "step": 930
    },
    {
      "epoch": 1.0479375696767002,
      "grad_norm": 0.5311689895480008,
      "learning_rate": 8.226346763012234e-06,
      "loss": 0.1591,
      "step": 940
    },
    {
      "epoch": 1.0590858416945372,
      "grad_norm": 0.6216949836162623,
      "learning_rate": 8.176509483747863e-06,
      "loss": 0.1587,
      "step": 950
    },
    {
      "epoch": 1.0702341137123745,
      "grad_norm": 0.5643715674767006,
      "learning_rate": 8.126137327107556e-06,
      "loss": 0.1559,
      "step": 960
    },
    {
      "epoch": 1.0813823857302118,
      "grad_norm": 0.528226825221165,
      "learning_rate": 8.075238775020011e-06,
      "loss": 0.1582,
      "step": 970
    },
    {
      "epoch": 1.0925306577480491,
      "grad_norm": 0.5303123877825515,
      "learning_rate": 8.023822398051168e-06,
      "loss": 0.1588,
      "step": 980
    },
    {
      "epoch": 1.1036789297658862,
      "grad_norm": 0.5437405641707471,
      "learning_rate": 7.971896853961043e-06,
      "loss": 0.1576,
      "step": 990
    },
    {
      "epoch": 1.1148272017837235,
      "grad_norm": 0.5691619401154729,
      "learning_rate": 7.919470886245886e-06,
      "loss": 0.1561,
      "step": 1000
    },
    {
      "epoch": 1.1148272017837235,
      "eval_accuracy": 0.9239298004895466,
      "eval_loss": 0.20845723152160645,
      "eval_runtime": 4.947,
      "eval_samples_per_second": 46.493,
      "eval_steps_per_second": 1.617,
      "step": 1000
    },
    {
      "epoch": 1.1259754738015608,
      "grad_norm": 0.47581750607341355,
      "learning_rate": 7.86655332266591e-06,
      "loss": 0.1586,
      "step": 1010
    },
    {
      "epoch": 1.137123745819398,
      "grad_norm": 0.6353712929162172,
      "learning_rate": 7.813153073758833e-06,
      "loss": 0.1582,
      "step": 1020
    },
    {
      "epoch": 1.1482720178372352,
      "grad_norm": 0.5419698538850736,
      "learning_rate": 7.759279131339455e-06,
      "loss": 0.1579,
      "step": 1030
    },
    {
      "epoch": 1.1594202898550725,
      "grad_norm": 0.5730038786065867,
      "learning_rate": 7.704940566985592e-06,
      "loss": 0.163,
      "step": 1040
    },
    {
      "epoch": 1.1705685618729098,
      "grad_norm": 0.5280364919184873,
      "learning_rate": 7.650146530510542e-06,
      "loss": 0.1617,
      "step": 1050
    },
    {
      "epoch": 1.1817168338907469,
      "grad_norm": 0.544836252028228,
      "learning_rate": 7.594906248422392e-06,
      "loss": 0.1596,
      "step": 1060
    },
    {
      "epoch": 1.1928651059085842,
      "grad_norm": 0.5378141986574141,
      "learning_rate": 7.539229022370418e-06,
      "loss": 0.1603,
      "step": 1070
    },
    {
      "epoch": 1.2040133779264215,
      "grad_norm": 0.5500474096682983,
      "learning_rate": 7.483124227578811e-06,
      "loss": 0.1579,
      "step": 1080
    },
    {
      "epoch": 1.2151616499442586,
      "grad_norm": 0.5018225238945918,
      "learning_rate": 7.426601311268043e-06,
      "loss": 0.16,
      "step": 1090
    },
    {
      "epoch": 1.2263099219620959,
      "grad_norm": 0.575725285027047,
      "learning_rate": 7.369669791064085e-06,
      "loss": 0.1596,
      "step": 1100
    },
    {
      "epoch": 1.2374581939799332,
      "grad_norm": 0.47947733030163836,
      "learning_rate": 7.312339253395778e-06,
      "loss": 0.1582,
      "step": 1110
    },
    {
      "epoch": 1.2486064659977703,
      "grad_norm": 0.4980356373309633,
      "learning_rate": 7.254619351880625e-06,
      "loss": 0.1581,
      "step": 1120
    },
    {
      "epoch": 1.2597547380156076,
      "grad_norm": 0.578756138117811,
      "learning_rate": 7.1965198056992615e-06,
      "loss": 0.1592,
      "step": 1130
    },
    {
      "epoch": 1.2709030100334449,
      "grad_norm": 0.4635139388152137,
      "learning_rate": 7.138050397958889e-06,
      "loss": 0.1595,
      "step": 1140
    },
    {
      "epoch": 1.282051282051282,
      "grad_norm": 0.49482297151736354,
      "learning_rate": 7.079220974045941e-06,
      "loss": 0.1584,
      "step": 1150
    },
    {
      "epoch": 1.2931995540691192,
      "grad_norm": 0.5460979920247649,
      "learning_rate": 7.0200414399682745e-06,
      "loss": 0.1584,
      "step": 1160
    },
    {
      "epoch": 1.3043478260869565,
      "grad_norm": 0.48148732576426634,
      "learning_rate": 6.96052176068713e-06,
      "loss": 0.1576,
      "step": 1170
    },
    {
      "epoch": 1.3154960981047936,
      "grad_norm": 0.5821793738001766,
      "learning_rate": 6.900671958439192e-06,
      "loss": 0.1599,
      "step": 1180
    },
    {
      "epoch": 1.326644370122631,
      "grad_norm": 0.48883238562313075,
      "learning_rate": 6.840502111048982e-06,
      "loss": 0.158,
      "step": 1190
    },
    {
      "epoch": 1.3377926421404682,
      "grad_norm": 0.5974171958012616,
      "learning_rate": 6.780022350231912e-06,
      "loss": 0.1604,
      "step": 1200
    },
    {
      "epoch": 1.3489409141583055,
      "grad_norm": 0.5659057198248688,
      "learning_rate": 6.719242859888243e-06,
      "loss": 0.1576,
      "step": 1210
    },
    {
      "epoch": 1.3600891861761428,
      "grad_norm": 0.6417774138796649,
      "learning_rate": 6.65817387438827e-06,
      "loss": 0.1588,
      "step": 1220
    },
    {
      "epoch": 1.37123745819398,
      "grad_norm": 0.5414201206440971,
      "learning_rate": 6.596825676849006e-06,
      "loss": 0.1581,
      "step": 1230
    },
    {
      "epoch": 1.3823857302118172,
      "grad_norm": 0.513016285464973,
      "learning_rate": 6.535208597402658e-06,
      "loss": 0.1588,
      "step": 1240
    },
    {
      "epoch": 1.3935340022296545,
      "grad_norm": 0.5513989056143007,
      "learning_rate": 6.473333011457181e-06,
      "loss": 0.1586,
      "step": 1250
    },
    {
      "epoch": 1.4046822742474916,
      "grad_norm": 0.5049419184837916,
      "learning_rate": 6.411209337949214e-06,
      "loss": 0.1594,
      "step": 1260
    },
    {
      "epoch": 1.415830546265329,
      "grad_norm": 0.5217098963627814,
      "learning_rate": 6.348848037589683e-06,
      "loss": 0.159,
      "step": 1270
    },
    {
      "epoch": 1.4269788182831662,
      "grad_norm": 0.5169737196009081,
      "learning_rate": 6.286259611102376e-06,
      "loss": 0.1589,
      "step": 1280
    },
    {
      "epoch": 1.4381270903010033,
      "grad_norm": 0.5747675813836013,
      "learning_rate": 6.223454597455776e-06,
      "loss": 0.1587,
      "step": 1290
    },
    {
      "epoch": 1.4492753623188406,
      "grad_norm": 0.5345109793646367,
      "learning_rate": 6.160443572088443e-06,
      "loss": 0.1595,
      "step": 1300
    },
    {
      "epoch": 1.4604236343366779,
      "grad_norm": 0.49324906218088477,
      "learning_rate": 6.097237145128282e-06,
      "loss": 0.1584,
      "step": 1310
    },
    {
      "epoch": 1.471571906354515,
      "grad_norm": 0.5093141609186953,
      "learning_rate": 6.0338459596059395e-06,
      "loss": 0.1554,
      "step": 1320
    },
    {
      "epoch": 1.4827201783723523,
      "grad_norm": 0.509538948244533,
      "learning_rate": 5.9702806896626855e-06,
      "loss": 0.1585,
      "step": 1330
    },
    {
      "epoch": 1.4938684503901896,
      "grad_norm": 0.6369579950149653,
      "learning_rate": 5.90655203875304e-06,
      "loss": 0.159,
      "step": 1340
    },
    {
      "epoch": 1.5050167224080266,
      "grad_norm": 0.47625480890424343,
      "learning_rate": 5.8426707378424675e-06,
      "loss": 0.1579,
      "step": 1350
    },
    {
      "epoch": 1.516164994425864,
      "grad_norm": 0.5438272649779904,
      "learning_rate": 5.778647543600443e-06,
      "loss": 0.1564,
      "step": 1360
    },
    {
      "epoch": 1.5273132664437012,
      "grad_norm": 0.5161276266998152,
      "learning_rate": 5.714493236589187e-06,
      "loss": 0.1545,
      "step": 1370
    },
    {
      "epoch": 1.5384615384615383,
      "grad_norm": 0.5315157886776762,
      "learning_rate": 5.650218619448379e-06,
      "loss": 0.1552,
      "step": 1380
    },
    {
      "epoch": 1.5496098104793758,
      "grad_norm": 0.5390188368744134,
      "learning_rate": 5.5858345150761515e-06,
      "loss": 0.1556,
      "step": 1390
    },
    {
      "epoch": 1.560758082497213,
      "grad_norm": 0.5357636181743068,
      "learning_rate": 5.521351764806672e-06,
      "loss": 0.1568,
      "step": 1400
    },
    {
      "epoch": 1.57190635451505,
      "grad_norm": 0.4874443181407851,
      "learning_rate": 5.456781226584621e-06,
      "loss": 0.158,
      "step": 1410
    },
    {
      "epoch": 1.5830546265328875,
      "grad_norm": 0.4563465609878918,
      "learning_rate": 5.392133773136877e-06,
      "loss": 0.1602,
      "step": 1420
    },
    {
      "epoch": 1.5942028985507246,
      "grad_norm": 0.493572716982426,
      "learning_rate": 5.327420290141712e-06,
      "loss": 0.1571,
      "step": 1430
    },
    {
      "epoch": 1.605351170568562,
      "grad_norm": 0.5297224295489479,
      "learning_rate": 5.262651674395799e-06,
      "loss": 0.1567,
      "step": 1440
    },
    {
      "epoch": 1.6164994425863992,
      "grad_norm": 0.4990830580335757,
      "learning_rate": 5.197838831979352e-06,
      "loss": 0.1574,
      "step": 1450
    },
    {
      "epoch": 1.6276477146042363,
      "grad_norm": 0.4877320539367503,
      "learning_rate": 5.132992676419704e-06,
      "loss": 0.1569,
      "step": 1460
    },
    {
      "epoch": 1.6387959866220736,
      "grad_norm": 0.546166024878597,
      "learning_rate": 5.068124126853633e-06,
      "loss": 0.157,
      "step": 1470
    },
    {
      "epoch": 1.649944258639911,
      "grad_norm": 0.4770361938111255,
      "learning_rate": 5.0032441061887345e-06,
      "loss": 0.1569,
      "step": 1480
    },
    {
      "epoch": 1.661092530657748,
      "grad_norm": 0.44557527525358764,
      "learning_rate": 4.938363539264175e-06,
      "loss": 0.1571,
      "step": 1490
    },
    {
      "epoch": 1.6722408026755853,
      "grad_norm": 0.48803538459110823,
      "learning_rate": 4.87349335101109e-06,
      "loss": 0.1547,
      "step": 1500
    },
    {
      "epoch": 1.6722408026755853,
      "eval_accuracy": 0.9265380200765034,
      "eval_loss": 0.19943761825561523,
      "eval_runtime": 5.3541,
      "eval_samples_per_second": 42.958,
      "eval_steps_per_second": 1.494,
      "step": 1500
    },
    {
      "epoch": 1.6833890746934226,
      "grad_norm": 0.5076348755331102,
      "learning_rate": 4.808644464613015e-06,
      "loss": 0.1562,
      "step": 1510
    },
    {
      "epoch": 1.6945373467112597,
      "grad_norm": 0.501612702783229,
      "learning_rate": 4.743827799666555e-06,
      "loss": 0.1547,
      "step": 1520
    },
    {
      "epoch": 1.705685618729097,
      "grad_norm": 0.4879807944214394,
      "learning_rate": 4.679054270342703e-06,
      "loss": 0.1563,
      "step": 1530
    },
    {
      "epoch": 1.7168338907469343,
      "grad_norm": 0.46616141709578995,
      "learning_rate": 4.614334783549049e-06,
      "loss": 0.1538,
      "step": 1540
    },
    {
      "epoch": 1.7279821627647713,
      "grad_norm": 0.5293473205445773,
      "learning_rate": 4.549680237093215e-06,
      "loss": 0.1571,
      "step": 1550
    },
    {
      "epoch": 1.7391304347826086,
      "grad_norm": 0.4768030365700139,
      "learning_rate": 4.485101517847831e-06,
      "loss": 0.1561,
      "step": 1560
    },
    {
      "epoch": 1.750278706800446,
      "grad_norm": 0.5395794005148075,
      "learning_rate": 4.420609499917337e-06,
      "loss": 0.1555,
      "step": 1570
    },
    {
      "epoch": 1.761426978818283,
      "grad_norm": 0.4879668723657388,
      "learning_rate": 4.3562150428069565e-06,
      "loss": 0.1545,
      "step": 1580
    },
    {
      "epoch": 1.7725752508361206,
      "grad_norm": 0.4939707212729394,
      "learning_rate": 4.291928989594102e-06,
      "loss": 0.1547,
      "step": 1590
    },
    {
      "epoch": 1.7837235228539576,
      "grad_norm": 0.4807444627751433,
      "learning_rate": 4.2277621651025734e-06,
      "loss": 0.1548,
      "step": 1600
    },
    {
      "epoch": 1.7948717948717947,
      "grad_norm": 0.49281674069166814,
      "learning_rate": 4.163725374079808e-06,
      "loss": 0.1559,
      "step": 1610
    },
    {
      "epoch": 1.8060200668896322,
      "grad_norm": 0.52027039374057,
      "learning_rate": 4.099829399377524e-06,
      "loss": 0.1527,
      "step": 1620
    },
    {
      "epoch": 1.8171683389074693,
      "grad_norm": 0.4693230544446258,
      "learning_rate": 4.036085000136046e-06,
      "loss": 0.1562,
      "step": 1630
    },
    {
      "epoch": 1.8283166109253066,
      "grad_norm": 0.5063480473756882,
      "learning_rate": 3.972502909972628e-06,
      "loss": 0.1546,
      "step": 1640
    },
    {
      "epoch": 1.839464882943144,
      "grad_norm": 0.4816499638368781,
      "learning_rate": 3.909093835174066e-06,
      "loss": 0.1547,
      "step": 1650
    },
    {
      "epoch": 1.850613154960981,
      "grad_norm": 0.64084638347163,
      "learning_rate": 3.845868452893919e-06,
      "loss": 0.1546,
      "step": 1660
    },
    {
      "epoch": 1.8617614269788183,
      "grad_norm": 0.5316728198827755,
      "learning_rate": 3.7828374093546303e-06,
      "loss": 0.1526,
      "step": 1670
    },
    {
      "epoch": 1.8729096989966556,
      "grad_norm": 0.47782006941247335,
      "learning_rate": 3.720011318054871e-06,
      "loss": 0.1542,
      "step": 1680
    },
    {
      "epoch": 1.8840579710144927,
      "grad_norm": 0.4960856794921382,
      "learning_rate": 3.657400757982367e-06,
      "loss": 0.1543,
      "step": 1690
    },
    {
      "epoch": 1.89520624303233,
      "grad_norm": 0.5205359533145413,
      "learning_rate": 3.595016271832572e-06,
      "loss": 0.1531,
      "step": 1700
    },
    {
      "epoch": 1.9063545150501673,
      "grad_norm": 0.5250980924250916,
      "learning_rate": 3.532868364233416e-06,
      "loss": 0.1541,
      "step": 1710
    },
    {
      "epoch": 1.9175027870680044,
      "grad_norm": 0.4552911079184954,
      "learning_rate": 3.470967499976492e-06,
      "loss": 0.1552,
      "step": 1720
    },
    {
      "epoch": 1.9286510590858417,
      "grad_norm": 0.47374387614448155,
      "learning_rate": 3.4093241022549316e-06,
      "loss": 0.1521,
      "step": 1730
    },
    {
      "epoch": 1.939799331103679,
      "grad_norm": 0.4809046264515791,
      "learning_rate": 3.347948550908303e-06,
      "loss": 0.1536,
      "step": 1740
    },
    {
      "epoch": 1.950947603121516,
      "grad_norm": 0.4611916983834658,
      "learning_rate": 3.286851180674788e-06,
      "loss": 0.1524,
      "step": 1750
    },
    {
      "epoch": 1.9620958751393534,
      "grad_norm": 0.4496375299269273,
      "learning_rate": 3.2260422794509704e-06,
      "loss": 0.1527,
      "step": 1760
    },
    {
      "epoch": 1.9732441471571907,
      "grad_norm": 0.49594296693835754,
      "learning_rate": 3.165532086559504e-06,
      "loss": 0.1523,
      "step": 1770
    },
    {
      "epoch": 1.9843924191750277,
      "grad_norm": 0.4311413917530802,
      "learning_rate": 3.105330791024965e-06,
      "loss": 0.1528,
      "step": 1780
    },
    {
      "epoch": 1.9955406911928653,
      "grad_norm": 0.46002893870709255,
      "learning_rate": 3.045448529858165e-06,
      "loss": 0.1523,
      "step": 1790
    },
    {
      "epoch": 2.0066889632107023,
      "grad_norm": 0.6640864376769408,
      "learning_rate": 2.9858953863492334e-06,
      "loss": 0.1257,
      "step": 1800
    },
    {
      "epoch": 2.0178372352285394,
      "grad_norm": 0.5065513503070602,
      "learning_rate": 2.9266813883697342e-06,
      "loss": 0.111,
      "step": 1810
    },
    {
      "epoch": 2.028985507246377,
      "grad_norm": 0.5422041920978169,
      "learning_rate": 2.867816506684126e-06,
      "loss": 0.1125,
      "step": 1820
    },
    {
      "epoch": 2.040133779264214,
      "grad_norm": 0.5207289779266513,
      "learning_rate": 2.809310653270825e-06,
      "loss": 0.1113,
      "step": 1830
    },
    {
      "epoch": 2.051282051282051,
      "grad_norm": 0.4849357576510085,
      "learning_rate": 2.751173679653184e-06,
      "loss": 0.1101,
      "step": 1840
    },
    {
      "epoch": 2.0624303232998886,
      "grad_norm": 0.47414870888584787,
      "learning_rate": 2.693415375240629e-06,
      "loss": 0.1099,
      "step": 1850
    },
    {
      "epoch": 2.0735785953177257,
      "grad_norm": 0.47845407787457195,
      "learning_rate": 2.636045465680282e-06,
      "loss": 0.1098,
      "step": 1860
    },
    {
      "epoch": 2.084726867335563,
      "grad_norm": 0.5151744285005124,
      "learning_rate": 2.5790736112192893e-06,
      "loss": 0.1094,
      "step": 1870
    },
    {
      "epoch": 2.0958751393534003,
      "grad_norm": 0.44908780393249104,
      "learning_rate": 2.522509405078187e-06,
      "loss": 0.1105,
      "step": 1880
    },
    {
      "epoch": 2.1070234113712374,
      "grad_norm": 0.46168216362984565,
      "learning_rate": 2.4663623718355444e-06,
      "loss": 0.1102,
      "step": 1890
    },
    {
      "epoch": 2.1181716833890745,
      "grad_norm": 0.5003674108234805,
      "learning_rate": 2.410641965824166e-06,
      "loss": 0.1089,
      "step": 1900
    },
    {
      "epoch": 2.129319955406912,
      "grad_norm": 0.47533896359509586,
      "learning_rate": 2.355357569539114e-06,
      "loss": 0.1119,
      "step": 1910
    },
    {
      "epoch": 2.140468227424749,
      "grad_norm": 0.4874105883605131,
      "learning_rate": 2.300518492057842e-06,
      "loss": 0.114,
      "step": 1920
    },
    {
      "epoch": 2.1516164994425866,
      "grad_norm": 0.5034128167417606,
      "learning_rate": 2.2461339674726804e-06,
      "loss": 0.1102,
      "step": 1930
    },
    {
      "epoch": 2.1627647714604237,
      "grad_norm": 0.4772612798475211,
      "learning_rate": 2.192213153335953e-06,
      "loss": 0.1109,
      "step": 1940
    },
    {
      "epoch": 2.1739130434782608,
      "grad_norm": 0.4919335114976622,
      "learning_rate": 2.1387651291179775e-06,
      "loss": 0.1106,
      "step": 1950
    },
    {
      "epoch": 2.1850613154960983,
      "grad_norm": 0.4893367756986907,
      "learning_rate": 2.085798894678217e-06,
      "loss": 0.1089,
      "step": 1960
    },
    {
      "epoch": 2.1962095875139354,
      "grad_norm": 0.5106660638111142,
      "learning_rate": 2.0333233687498433e-06,
      "loss": 0.1094,
      "step": 1970
    },
    {
      "epoch": 2.2073578595317724,
      "grad_norm": 0.4698835768687655,
      "learning_rate": 1.98134738743794e-06,
      "loss": 0.1105,
      "step": 1980
    },
    {
      "epoch": 2.21850613154961,
      "grad_norm": 0.481190970836223,
      "learning_rate": 1.9298797027316474e-06,
      "loss": 0.1088,
      "step": 1990
    },
    {
      "epoch": 2.229654403567447,
      "grad_norm": 0.4828488140066944,
      "learning_rate": 1.878928981030445e-06,
      "loss": 0.1092,
      "step": 2000
    },
    {
      "epoch": 2.229654403567447,
      "eval_accuracy": 0.9278099495985295,
      "eval_loss": 0.20725835859775543,
      "eval_runtime": 4.9127,
      "eval_samples_per_second": 46.817,
      "eval_steps_per_second": 1.628,
      "step": 2000
    },
    {
      "epoch": 2.240802675585284,
      "grad_norm": 0.5445446664477709,
      "learning_rate": 1.828503801684864e-06,
      "loss": 0.1098,
      "step": 2010
    },
    {
      "epoch": 2.2519509476031216,
      "grad_norm": 0.4945888022659562,
      "learning_rate": 1.7786126555518484e-06,
      "loss": 0.1093,
      "step": 2020
    },
    {
      "epoch": 2.2630992196209587,
      "grad_norm": 0.4728964267184926,
      "learning_rate": 1.729263943565022e-06,
      "loss": 0.1088,
      "step": 2030
    },
    {
      "epoch": 2.274247491638796,
      "grad_norm": 0.46594045141875756,
      "learning_rate": 1.6804659753200925e-06,
      "loss": 0.1101,
      "step": 2040
    },
    {
      "epoch": 2.2853957636566333,
      "grad_norm": 0.4731193453792897,
      "learning_rate": 1.6322269676756402e-06,
      "loss": 0.1096,
      "step": 2050
    },
    {
      "epoch": 2.2965440356744704,
      "grad_norm": 0.46507870631813036,
      "learning_rate": 1.5845550433695172e-06,
      "loss": 0.109,
      "step": 2060
    },
    {
      "epoch": 2.3076923076923075,
      "grad_norm": 0.5041104454052192,
      "learning_rate": 1.5374582296511054e-06,
      "loss": 0.1097,
      "step": 2070
    },
    {
      "epoch": 2.318840579710145,
      "grad_norm": 0.4506495588973169,
      "learning_rate": 1.4909444569296334e-06,
      "loss": 0.1084,
      "step": 2080
    },
    {
      "epoch": 2.329988851727982,
      "grad_norm": 0.5081246384541027,
      "learning_rate": 1.4450215574388265e-06,
      "loss": 0.1094,
      "step": 2090
    },
    {
      "epoch": 2.3411371237458196,
      "grad_norm": 0.449952731509518,
      "learning_rate": 1.3996972639180645e-06,
      "loss": 0.1086,
      "step": 2100
    },
    {
      "epoch": 2.3522853957636567,
      "grad_norm": 0.5009677565037799,
      "learning_rate": 1.3549792083103037e-06,
      "loss": 0.1079,
      "step": 2110
    },
    {
      "epoch": 2.3634336677814938,
      "grad_norm": 0.4544414237147968,
      "learning_rate": 1.3108749204769732e-06,
      "loss": 0.1093,
      "step": 2120
    },
    {
      "epoch": 2.374581939799331,
      "grad_norm": 0.476556243372622,
      "learning_rate": 1.2673918269300557e-06,
      "loss": 0.109,
      "step": 2130
    },
    {
      "epoch": 2.3857302118171684,
      "grad_norm": 0.48618656512125896,
      "learning_rate": 1.2245372495815726e-06,
      "loss": 0.1086,
      "step": 2140
    },
    {
      "epoch": 2.3968784838350055,
      "grad_norm": 0.48915278085562003,
      "learning_rate": 1.1823184045106816e-06,
      "loss": 0.1088,
      "step": 2150
    },
    {
      "epoch": 2.408026755852843,
      "grad_norm": 0.4612765468912016,
      "learning_rate": 1.140742400748593e-06,
      "loss": 0.1085,
      "step": 2160
    },
    {
      "epoch": 2.41917502787068,
      "grad_norm": 0.49881271267886607,
      "learning_rate": 1.099816239081521e-06,
      "loss": 0.1084,
      "step": 2170
    },
    {
      "epoch": 2.430323299888517,
      "grad_norm": 0.47510269822553847,
      "learning_rate": 1.0595468108718448e-06,
      "loss": 0.1089,
      "step": 2180
    },
    {
      "epoch": 2.4414715719063547,
      "grad_norm": 0.47369643358112645,
      "learning_rate": 1.0199408968977138e-06,
      "loss": 0.1087,
      "step": 2190
    },
    {
      "epoch": 2.4526198439241917,
      "grad_norm": 0.47267131195325657,
      "learning_rate": 9.810051662112558e-07,
      "loss": 0.108,
      "step": 2200
    },
    {
      "epoch": 2.463768115942029,
      "grad_norm": 0.45402353633897424,
      "learning_rate": 9.427461750156142e-07,
      "loss": 0.1096,
      "step": 2210
    },
    {
      "epoch": 2.4749163879598663,
      "grad_norm": 0.46683292783796754,
      "learning_rate": 9.051703655609762e-07,
      "loss": 0.1089,
      "step": 2220
    },
    {
      "epoch": 2.4860646599777034,
      "grad_norm": 0.4654495260542525,
      "learning_rate": 8.68284065059794e-07,
      "loss": 0.1095,
      "step": 2230
    },
    {
      "epoch": 2.4972129319955405,
      "grad_norm": 0.4819165766260374,
      "learning_rate": 8.320934846213746e-07,
      "loss": 0.1082,
      "step": 2240
    },
    {
      "epoch": 2.508361204013378,
      "grad_norm": 0.4582207481405345,
      "learning_rate": 7.966047182060226e-07,
      "loss": 0.1083,
      "step": 2250
    },
    {
      "epoch": 2.519509476031215,
      "grad_norm": 0.44176541619456144,
      "learning_rate": 7.618237415989032e-07,
      "loss": 0.1083,
      "step": 2260
    },
    {
      "epoch": 2.5306577480490526,
      "grad_norm": 0.45340706173810835,
      "learning_rate": 7.277564114038149e-07,
      "loss": 0.1082,
      "step": 2270
    },
    {
      "epoch": 2.5418060200668897,
      "grad_norm": 0.4607933599342553,
      "learning_rate": 6.944084640570142e-07,
      "loss": 0.1072,
      "step": 2280
    },
    {
      "epoch": 2.552954292084727,
      "grad_norm": 0.4613886801061754,
      "learning_rate": 6.617855148612945e-07,
      "loss": 0.1071,
      "step": 2290
    },
    {
      "epoch": 2.564102564102564,
      "grad_norm": 0.4490390133193121,
      "learning_rate": 6.298930570404432e-07,
      "loss": 0.1087,
      "step": 2300
    },
    {
      "epoch": 2.5752508361204014,
      "grad_norm": 0.4561285734622287,
      "learning_rate": 5.987364608142693e-07,
      "loss": 0.1076,
      "step": 2310
    },
    {
      "epoch": 2.5863991081382385,
      "grad_norm": 0.461278673680382,
      "learning_rate": 5.683209724943344e-07,
      "loss": 0.109,
      "step": 2320
    },
    {
      "epoch": 2.597547380156076,
      "grad_norm": 0.45105628731245295,
      "learning_rate": 5.386517136005543e-07,
      "loss": 0.1081,
      "step": 2330
    },
    {
      "epoch": 2.608695652173913,
      "grad_norm": 0.438445525307319,
      "learning_rate": 5.097336799988067e-07,
      "loss": 0.1079,
      "step": 2340
    },
    {
      "epoch": 2.61984392419175,
      "grad_norm": 0.4457791687565759,
      "learning_rate": 4.815717410597042e-07,
      "loss": 0.1081,
      "step": 2350
    },
    {
      "epoch": 2.6309921962095872,
      "grad_norm": 0.44606421814267033,
      "learning_rate": 4.541706388386624e-07,
      "loss": 0.1086,
      "step": 2360
    },
    {
      "epoch": 2.6421404682274248,
      "grad_norm": 0.4398808910735919,
      "learning_rate": 4.275349872774098e-07,
      "loss": 0.1073,
      "step": 2370
    },
    {
      "epoch": 2.653288740245262,
      "grad_norm": 0.4522695550459701,
      "learning_rate": 4.0166927142706313e-07,
      "loss": 0.1076,
      "step": 2380
    },
    {
      "epoch": 2.6644370122630994,
      "grad_norm": 0.4436224867256293,
      "learning_rate": 3.765778466929182e-07,
      "loss": 0.1066,
      "step": 2390
    },
    {
      "epoch": 2.6755852842809364,
      "grad_norm": 0.428940667218784,
      "learning_rate": 3.5226493810105624e-07,
      "loss": 0.1081,
      "step": 2400
    },
    {
      "epoch": 2.6867335562987735,
      "grad_norm": 0.45631649150745296,
      "learning_rate": 3.2873463958691675e-07,
      "loss": 0.108,
      "step": 2410
    },
    {
      "epoch": 2.697881828316611,
      "grad_norm": 0.4407463233212753,
      "learning_rate": 3.05990913305938e-07,
      "loss": 0.1077,
      "step": 2420
    },
    {
      "epoch": 2.709030100334448,
      "grad_norm": 0.45899753246408376,
      "learning_rate": 2.840375889663871e-07,
      "loss": 0.106,
      "step": 2430
    },
    {
      "epoch": 2.7201783723522857,
      "grad_norm": 0.4620830771843678,
      "learning_rate": 2.62878363184495e-07,
      "loss": 0.107,
      "step": 2440
    },
    {
      "epoch": 2.7313266443701227,
      "grad_norm": 0.4540560827599527,
      "learning_rate": 2.425167988620014e-07,
      "loss": 0.1078,
      "step": 2450
    },
    {
      "epoch": 2.74247491638796,
      "grad_norm": 0.4379908641559117,
      "learning_rate": 2.2295632458621097e-07,
      "loss": 0.1076,
      "step": 2460
    },
    {
      "epoch": 2.753623188405797,
      "grad_norm": 0.44868425706880766,
      "learning_rate": 2.0420023405267663e-07,
      "loss": 0.1076,
      "step": 2470
    },
    {
      "epoch": 2.7647714604236344,
      "grad_norm": 0.47473094706270963,
      "learning_rate": 1.8625168551058115e-07,
      "loss": 0.1071,
      "step": 2480
    },
    {
      "epoch": 2.7759197324414715,
      "grad_norm": 0.45861955063568105,
      "learning_rate": 1.6911370123094238e-07,
      "loss": 0.1081,
      "step": 2490
    },
    {
      "epoch": 2.787068004459309,
      "grad_norm": 0.45384731541579043,
      "learning_rate": 1.5278916699770163e-07,
      "loss": 0.1073,
      "step": 2500
    },
    {
      "epoch": 2.787068004459309,
      "eval_accuracy": 0.9283863306145088,
      "eval_loss": 0.20636913180351257,
      "eval_runtime": 5.1195,
      "eval_samples_per_second": 44.926,
      "eval_steps_per_second": 1.563,
      "step": 2500
    },
    {
      "epoch": 2.798216276477146,
      "grad_norm": 0.4773943035056343,
      "learning_rate": 1.3728083162180384e-07,
      "loss": 0.1075,
      "step": 2510
    },
    {
      "epoch": 2.809364548494983,
      "grad_norm": 0.428735968014936,
      "learning_rate": 1.2259130647833627e-07,
      "loss": 0.1065,
      "step": 2520
    },
    {
      "epoch": 2.8205128205128203,
      "grad_norm": 0.4385864949658292,
      "learning_rate": 1.0872306506681251e-07,
      "loss": 0.1079,
      "step": 2530
    },
    {
      "epoch": 2.831661092530658,
      "grad_norm": 0.4743582222874101,
      "learning_rate": 9.567844259467051e-08,
      "loss": 0.1083,
      "step": 2540
    },
    {
      "epoch": 2.842809364548495,
      "grad_norm": 0.45862196445217895,
      "learning_rate": 8.345963558406001e-08,
      "loss": 0.1079,
      "step": 2550
    },
    {
      "epoch": 2.8539576365663324,
      "grad_norm": 0.45558482700348846,
      "learning_rate": 7.206870150197831e-08,
      "loss": 0.107,
      "step": 2560
    },
    {
      "epoch": 2.8651059085841695,
      "grad_norm": 0.4458457370599944,
      "learning_rate": 6.150755841382527e-08,
      "loss": 0.1067,
      "step": 2570
    },
    {
      "epoch": 2.8762541806020065,
      "grad_norm": 0.44342362029660554,
      "learning_rate": 5.177798466042716e-08,
      "loss": 0.1078,
      "step": 2580
    },
    {
      "epoch": 2.887402452619844,
      "grad_norm": 0.4369582199234232,
      "learning_rate": 4.2881618558592855e-08,
      "loss": 0.109,
      "step": 2590
    },
    {
      "epoch": 2.898550724637681,
      "grad_norm": 0.4355601557089695,
      "learning_rate": 3.481995812524286e-08,
      "loss": 0.1064,
      "step": 2600
    },
    {
      "epoch": 2.9096989966555182,
      "grad_norm": 0.4377173417059771,
      "learning_rate": 2.7594360825166644e-08,
      "loss": 0.107,
      "step": 2610
    },
    {
      "epoch": 2.9208472686733558,
      "grad_norm": 0.4495055773584237,
      "learning_rate": 2.120604334244547e-08,
      "loss": 0.1067,
      "step": 2620
    },
    {
      "epoch": 2.931995540691193,
      "grad_norm": 0.44729903103488183,
      "learning_rate": 1.565608137558128e-08,
      "loss": 0.1063,
      "step": 2630
    },
    {
      "epoch": 2.94314381270903,
      "grad_norm": 0.44620730977493767,
      "learning_rate": 1.0945409456364353e-08,
      "loss": 0.1075,
      "step": 2640
    },
    {
      "epoch": 2.9542920847268674,
      "grad_norm": 0.46052917967733736,
      "learning_rate": 7.0748207925103176e-09,
      "loss": 0.1075,
      "step": 2650
    },
    {
      "epoch": 2.9654403567447045,
      "grad_norm": 0.4450651551983087,
      "learning_rate": 4.044967134099187e-09,
      "loss": 0.1079,
      "step": 2660
    },
    {
      "epoch": 2.976588628762542,
      "grad_norm": 0.4581682413498136,
      "learning_rate": 1.8563586638281617e-09,
      "loss": 0.1064,
      "step": 2670
    },
    {
      "epoch": 2.987736900780379,
      "grad_norm": 0.44561324878780834,
      "learning_rate": 5.093639111025673e-10,
      "loss": 0.1071,
      "step": 2680
    },
    {
      "epoch": 2.998885172798216,
      "grad_norm": 0.46042910026727557,
      "learning_rate": 4.2096899854904285e-12,
      "loss": 0.1079,
      "step": 2690
    },
    {
      "epoch": 3.0,
      "step": 2691,
      "total_flos": 2978959170994176.0,
      "train_loss": 0.161040273276637,
      "train_runtime": 34957.7462,
      "train_samples_per_second": 19.698,
      "train_steps_per_second": 0.077
    }
  ],
  "logging_steps": 10,
  "max_steps": 2691,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2978959170994176.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}