{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 20020, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00024975024975024975, "grad_norm": 3.756018877029419, "learning_rate": 1.2468827930174565e-06, "loss": 1.9441, "step": 5 }, { "epoch": 0.0004995004995004995, "grad_norm": 2.986447334289551, "learning_rate": 2.493765586034913e-06, "loss": 1.8869, "step": 10 }, { "epoch": 0.0007492507492507493, "grad_norm": 3.6956887245178223, "learning_rate": 3.7406483790523696e-06, "loss": 1.7853, "step": 15 }, { "epoch": 0.000999000999000999, "grad_norm": 3.9595601558685303, "learning_rate": 4.987531172069826e-06, "loss": 2.2836, "step": 20 }, { "epoch": 0.0012487512487512488, "grad_norm": 3.825083017349243, "learning_rate": 6.234413965087282e-06, "loss": 1.7089, "step": 25 }, { "epoch": 0.0014985014985014985, "grad_norm": 5.073272705078125, "learning_rate": 7.481296758104739e-06, "loss": 1.7508, "step": 30 }, { "epoch": 0.0017482517482517483, "grad_norm": 10.50892448425293, "learning_rate": 8.728179551122195e-06, "loss": 1.7703, "step": 35 }, { "epoch": 0.001998001998001998, "grad_norm": 4.312236785888672, "learning_rate": 9.975062344139652e-06, "loss": 1.6257, "step": 40 }, { "epoch": 0.002247752247752248, "grad_norm": 4.734762191772461, "learning_rate": 1.1221945137157107e-05, "loss": 1.4966, "step": 45 }, { "epoch": 0.0024975024975024975, "grad_norm": 4.110627174377441, "learning_rate": 1.2468827930174564e-05, "loss": 1.4744, "step": 50 }, { "epoch": 0.0027472527472527475, "grad_norm": 4.411424160003662, "learning_rate": 1.3715710723192021e-05, "loss": 1.3441, "step": 55 }, { "epoch": 0.002997002997002997, "grad_norm": 3.845834970474243, "learning_rate": 1.4962593516209478e-05, "loss": 1.3276, "step": 60 }, { "epoch": 0.003246753246753247, "grad_norm": 4.205484390258789, "learning_rate": 1.6209476309226934e-05, "loss": 1.2852, "step": 65 }, { "epoch": 0.0034965034965034965, "grad_norm": 3.3827478885650635, "learning_rate": 1.745635910224439e-05, "loss": 1.3051, "step": 70 }, { "epoch": 0.0037462537462537465, "grad_norm": 3.255404233932495, "learning_rate": 1.8703241895261847e-05, "loss": 1.2703, "step": 75 }, { "epoch": 0.003996003996003996, "grad_norm": 3.9977900981903076, "learning_rate": 1.9950124688279304e-05, "loss": 1.2377, "step": 80 }, { "epoch": 0.004245754245754246, "grad_norm": 2.8934075832366943, "learning_rate": 2.119700748129676e-05, "loss": 1.2052, "step": 85 }, { "epoch": 0.004495504495504496, "grad_norm": 2.9269988536834717, "learning_rate": 2.2443890274314215e-05, "loss": 1.1295, "step": 90 }, { "epoch": 0.004745254745254745, "grad_norm": 3.8179385662078857, "learning_rate": 2.3690773067331672e-05, "loss": 1.2561, "step": 95 }, { "epoch": 0.004995004995004995, "grad_norm": 3.89117169380188, "learning_rate": 2.493765586034913e-05, "loss": 1.2037, "step": 100 }, { "epoch": 0.005244755244755245, "grad_norm": 3.3989474773406982, "learning_rate": 2.6184538653366586e-05, "loss": 1.1534, "step": 105 }, { "epoch": 0.005494505494505495, "grad_norm": 2.635782480239868, "learning_rate": 2.7431421446384043e-05, "loss": 1.2063, "step": 110 }, { "epoch": 0.005744255744255744, "grad_norm": 2.9206197261810303, "learning_rate": 2.86783042394015e-05, "loss": 1.3069, "step": 115 }, { "epoch": 0.005994005994005994, "grad_norm": 4.1555047035217285, "learning_rate": 2.9925187032418957e-05, "loss": 1.1143, "step": 120 }, { "epoch": 0.006243756243756244, "grad_norm": 5.164870738983154, "learning_rate": 3.1172069825436414e-05, "loss": 1.152, "step": 125 }, { "epoch": 0.006493506493506494, "grad_norm": 3.1204957962036133, "learning_rate": 3.241895261845387e-05, "loss": 1.1756, "step": 130 }, { "epoch": 0.006743256743256743, "grad_norm": 3.4282779693603516, "learning_rate": 3.366583541147133e-05, "loss": 1.1886, "step": 135 }, { "epoch": 0.006993006993006993, "grad_norm": 4.556999683380127, "learning_rate": 3.491271820448878e-05, "loss": 1.1627, "step": 140 }, { "epoch": 0.007242757242757243, "grad_norm": 3.621753454208374, "learning_rate": 3.6159600997506235e-05, "loss": 1.2224, "step": 145 }, { "epoch": 0.007492507492507493, "grad_norm": 4.2687554359436035, "learning_rate": 3.7406483790523695e-05, "loss": 1.2472, "step": 150 }, { "epoch": 0.007742257742257742, "grad_norm": 4.284986972808838, "learning_rate": 3.865336658354115e-05, "loss": 1.1662, "step": 155 }, { "epoch": 0.007992007992007992, "grad_norm": 3.8921751976013184, "learning_rate": 3.990024937655861e-05, "loss": 1.1758, "step": 160 }, { "epoch": 0.008241758241758242, "grad_norm": 4.716696739196777, "learning_rate": 4.114713216957606e-05, "loss": 1.2019, "step": 165 }, { "epoch": 0.008491508491508492, "grad_norm": 3.3180506229400635, "learning_rate": 4.239401496259352e-05, "loss": 1.0968, "step": 170 }, { "epoch": 0.008741258741258742, "grad_norm": 5.070629119873047, "learning_rate": 4.3640897755610976e-05, "loss": 1.1304, "step": 175 }, { "epoch": 0.008991008991008992, "grad_norm": 3.270987033843994, "learning_rate": 4.488778054862843e-05, "loss": 1.2013, "step": 180 }, { "epoch": 0.00924075924075924, "grad_norm": 3.250957727432251, "learning_rate": 4.613466334164589e-05, "loss": 1.1059, "step": 185 }, { "epoch": 0.00949050949050949, "grad_norm": 3.4548802375793457, "learning_rate": 4.7381546134663344e-05, "loss": 1.1336, "step": 190 }, { "epoch": 0.00974025974025974, "grad_norm": 4.061188697814941, "learning_rate": 4.8628428927680804e-05, "loss": 1.0575, "step": 195 }, { "epoch": 0.00999000999000999, "grad_norm": 3.7871272563934326, "learning_rate": 4.987531172069826e-05, "loss": 1.1317, "step": 200 }, { "epoch": 0.01023976023976024, "grad_norm": 3.428098201751709, "learning_rate": 5.112219451371572e-05, "loss": 1.1199, "step": 205 }, { "epoch": 0.01048951048951049, "grad_norm": 4.50527286529541, "learning_rate": 5.236907730673317e-05, "loss": 1.1363, "step": 210 }, { "epoch": 0.01073926073926074, "grad_norm": 5.62974214553833, "learning_rate": 5.3615960099750625e-05, "loss": 1.219, "step": 215 }, { "epoch": 0.01098901098901099, "grad_norm": 3.5278027057647705, "learning_rate": 5.4862842892768086e-05, "loss": 1.2392, "step": 220 }, { "epoch": 0.011238761238761238, "grad_norm": 3.9554367065429688, "learning_rate": 5.610972568578554e-05, "loss": 1.0812, "step": 225 }, { "epoch": 0.011488511488511488, "grad_norm": 3.35343074798584, "learning_rate": 5.7356608478803e-05, "loss": 1.1149, "step": 230 }, { "epoch": 0.011738261738261738, "grad_norm": 3.4318833351135254, "learning_rate": 5.860349127182045e-05, "loss": 1.1643, "step": 235 }, { "epoch": 0.011988011988011988, "grad_norm": 2.8330917358398438, "learning_rate": 5.985037406483791e-05, "loss": 1.0544, "step": 240 }, { "epoch": 0.012237762237762238, "grad_norm": 3.66424298286438, "learning_rate": 6.109725685785537e-05, "loss": 1.1344, "step": 245 }, { "epoch": 0.012487512487512488, "grad_norm": 4.605635643005371, "learning_rate": 6.234413965087283e-05, "loss": 1.041, "step": 250 }, { "epoch": 0.012737262737262738, "grad_norm": 3.282487392425537, "learning_rate": 6.359102244389027e-05, "loss": 1.0905, "step": 255 }, { "epoch": 0.012987012987012988, "grad_norm": 3.922199010848999, "learning_rate": 6.483790523690773e-05, "loss": 1.1558, "step": 260 }, { "epoch": 0.013236763236763236, "grad_norm": 3.9927258491516113, "learning_rate": 6.60847880299252e-05, "loss": 1.0865, "step": 265 }, { "epoch": 0.013486513486513486, "grad_norm": 2.9562907218933105, "learning_rate": 6.733167082294266e-05, "loss": 1.0997, "step": 270 }, { "epoch": 0.013736263736263736, "grad_norm": 3.530869245529175, "learning_rate": 6.85785536159601e-05, "loss": 1.1269, "step": 275 }, { "epoch": 0.013986013986013986, "grad_norm": 3.434314250946045, "learning_rate": 6.982543640897756e-05, "loss": 1.119, "step": 280 }, { "epoch": 0.014235764235764236, "grad_norm": 3.4785149097442627, "learning_rate": 7.107231920199502e-05, "loss": 1.1741, "step": 285 }, { "epoch": 0.014485514485514486, "grad_norm": 3.027099847793579, "learning_rate": 7.231920199501247e-05, "loss": 1.0403, "step": 290 }, { "epoch": 0.014735264735264736, "grad_norm": 2.498100519180298, "learning_rate": 7.356608478802993e-05, "loss": 1.1071, "step": 295 }, { "epoch": 0.014985014985014986, "grad_norm": 3.4925882816314697, "learning_rate": 7.481296758104739e-05, "loss": 1.112, "step": 300 }, { "epoch": 0.015234765234765234, "grad_norm": 4.1729736328125, "learning_rate": 7.605985037406485e-05, "loss": 1.1822, "step": 305 }, { "epoch": 0.015484515484515484, "grad_norm": 4.937356472015381, "learning_rate": 7.73067331670823e-05, "loss": 1.0706, "step": 310 }, { "epoch": 0.015734265734265736, "grad_norm": 4.950087547302246, "learning_rate": 7.855361596009976e-05, "loss": 1.231, "step": 315 }, { "epoch": 0.015984015984015984, "grad_norm": 3.0792033672332764, "learning_rate": 7.980049875311722e-05, "loss": 1.0918, "step": 320 }, { "epoch": 0.016233766233766232, "grad_norm": 2.387505054473877, "learning_rate": 8.104738154613466e-05, "loss": 1.1004, "step": 325 }, { "epoch": 0.016483516483516484, "grad_norm": 2.800842523574829, "learning_rate": 8.229426433915212e-05, "loss": 1.1141, "step": 330 }, { "epoch": 0.016733266733266732, "grad_norm": 2.915137767791748, "learning_rate": 8.354114713216959e-05, "loss": 1.1012, "step": 335 }, { "epoch": 0.016983016983016984, "grad_norm": 3.351140022277832, "learning_rate": 8.478802992518705e-05, "loss": 1.1748, "step": 340 }, { "epoch": 0.017232767232767232, "grad_norm": 3.1458077430725098, "learning_rate": 8.603491271820449e-05, "loss": 1.1699, "step": 345 }, { "epoch": 0.017482517482517484, "grad_norm": 3.345850944519043, "learning_rate": 8.728179551122195e-05, "loss": 1.1158, "step": 350 }, { "epoch": 0.017732267732267732, "grad_norm": 3.1474196910858154, "learning_rate": 8.852867830423941e-05, "loss": 1.0403, "step": 355 }, { "epoch": 0.017982017982017984, "grad_norm": 3.4693691730499268, "learning_rate": 8.977556109725686e-05, "loss": 1.1554, "step": 360 }, { "epoch": 0.018231768231768232, "grad_norm": 2.76824951171875, "learning_rate": 9.102244389027432e-05, "loss": 1.2328, "step": 365 }, { "epoch": 0.01848151848151848, "grad_norm": 3.951793670654297, "learning_rate": 9.226932668329178e-05, "loss": 1.1118, "step": 370 }, { "epoch": 0.018731268731268732, "grad_norm": 2.6339595317840576, "learning_rate": 9.351620947630924e-05, "loss": 1.1412, "step": 375 }, { "epoch": 0.01898101898101898, "grad_norm": 3.374406337738037, "learning_rate": 9.476309226932669e-05, "loss": 1.0935, "step": 380 }, { "epoch": 0.019230769230769232, "grad_norm": 3.418883800506592, "learning_rate": 9.600997506234415e-05, "loss": 1.0405, "step": 385 }, { "epoch": 0.01948051948051948, "grad_norm": 2.498117208480835, "learning_rate": 9.725685785536161e-05, "loss": 1.1132, "step": 390 }, { "epoch": 0.019730269730269732, "grad_norm": 3.406303644180298, "learning_rate": 9.850374064837906e-05, "loss": 1.1885, "step": 395 }, { "epoch": 0.01998001998001998, "grad_norm": 4.117725372314453, "learning_rate": 9.975062344139652e-05, "loss": 1.1703, "step": 400 }, { "epoch": 0.020229770229770228, "grad_norm": 3.262080192565918, "learning_rate": 9.997961160099904e-05, "loss": 1.1132, "step": 405 }, { "epoch": 0.02047952047952048, "grad_norm": 2.3854291439056396, "learning_rate": 9.995412610224782e-05, "loss": 1.148, "step": 410 }, { "epoch": 0.020729270729270728, "grad_norm": 2.444875478744507, "learning_rate": 9.992864060349661e-05, "loss": 1.1424, "step": 415 }, { "epoch": 0.02097902097902098, "grad_norm": 2.452338457107544, "learning_rate": 9.990315510474541e-05, "loss": 1.0894, "step": 420 }, { "epoch": 0.021228771228771228, "grad_norm": 2.6861560344696045, "learning_rate": 9.98776696059942e-05, "loss": 1.0443, "step": 425 }, { "epoch": 0.02147852147852148, "grad_norm": 3.2159478664398193, "learning_rate": 9.985218410724298e-05, "loss": 1.1358, "step": 430 }, { "epoch": 0.021728271728271728, "grad_norm": 2.24704647064209, "learning_rate": 9.982669860849178e-05, "loss": 1.1456, "step": 435 }, { "epoch": 0.02197802197802198, "grad_norm": 2.567363739013672, "learning_rate": 9.980121310974056e-05, "loss": 1.1589, "step": 440 }, { "epoch": 0.022227772227772228, "grad_norm": 2.6118075847625732, "learning_rate": 9.977572761098936e-05, "loss": 0.9363, "step": 445 }, { "epoch": 0.022477522477522476, "grad_norm": 3.0697689056396484, "learning_rate": 9.975024211223815e-05, "loss": 1.1437, "step": 450 }, { "epoch": 0.022727272727272728, "grad_norm": 2.219345808029175, "learning_rate": 9.972475661348693e-05, "loss": 1.0883, "step": 455 }, { "epoch": 0.022977022977022976, "grad_norm": 2.4899234771728516, "learning_rate": 9.969927111473572e-05, "loss": 1.0733, "step": 460 }, { "epoch": 0.023226773226773228, "grad_norm": 2.4743125438690186, "learning_rate": 9.96737856159845e-05, "loss": 1.1267, "step": 465 }, { "epoch": 0.023476523476523476, "grad_norm": 2.7509214878082275, "learning_rate": 9.964830011723329e-05, "loss": 1.119, "step": 470 }, { "epoch": 0.023726273726273728, "grad_norm": 2.4172253608703613, "learning_rate": 9.962281461848209e-05, "loss": 1.1147, "step": 475 }, { "epoch": 0.023976023976023976, "grad_norm": 2.1590776443481445, "learning_rate": 9.959732911973087e-05, "loss": 1.1093, "step": 480 }, { "epoch": 0.024225774225774224, "grad_norm": 1.8928271532058716, "learning_rate": 9.957184362097966e-05, "loss": 1.1545, "step": 485 }, { "epoch": 0.024475524475524476, "grad_norm": 2.209320068359375, "learning_rate": 9.954635812222846e-05, "loss": 1.0518, "step": 490 }, { "epoch": 0.024725274725274724, "grad_norm": 2.86975359916687, "learning_rate": 9.952087262347724e-05, "loss": 1.0435, "step": 495 }, { "epoch": 0.024975024975024976, "grad_norm": 2.1670563220977783, "learning_rate": 9.949538712472604e-05, "loss": 1.1035, "step": 500 }, { "epoch": 0.025224775224775224, "grad_norm": 2.6216042041778564, "learning_rate": 9.946990162597483e-05, "loss": 1.1316, "step": 505 }, { "epoch": 0.025474525474525476, "grad_norm": 2.534996271133423, "learning_rate": 9.944441612722361e-05, "loss": 1.2497, "step": 510 }, { "epoch": 0.025724275724275724, "grad_norm": 2.2985408306121826, "learning_rate": 9.941893062847241e-05, "loss": 1.1563, "step": 515 }, { "epoch": 0.025974025974025976, "grad_norm": 3.9628427028656006, "learning_rate": 9.93934451297212e-05, "loss": 1.0487, "step": 520 }, { "epoch": 0.026223776223776224, "grad_norm": 2.162043809890747, "learning_rate": 9.936795963096998e-05, "loss": 0.9919, "step": 525 }, { "epoch": 0.026473526473526472, "grad_norm": 2.4366073608398438, "learning_rate": 9.934247413221878e-05, "loss": 0.9541, "step": 530 }, { "epoch": 0.026723276723276724, "grad_norm": 2.980107545852661, "learning_rate": 9.931698863346757e-05, "loss": 1.1071, "step": 535 }, { "epoch": 0.026973026973026972, "grad_norm": 1.9352530241012573, "learning_rate": 9.929150313471634e-05, "loss": 1.0977, "step": 540 }, { "epoch": 0.027222777222777224, "grad_norm": 2.827826738357544, "learning_rate": 9.926601763596514e-05, "loss": 1.0033, "step": 545 }, { "epoch": 0.027472527472527472, "grad_norm": 2.5779693126678467, "learning_rate": 9.924053213721392e-05, "loss": 1.0334, "step": 550 }, { "epoch": 0.027722277722277724, "grad_norm": 2.095482587814331, "learning_rate": 9.921504663846272e-05, "loss": 1.0952, "step": 555 }, { "epoch": 0.027972027972027972, "grad_norm": 1.860028624534607, "learning_rate": 9.918956113971151e-05, "loss": 0.9497, "step": 560 }, { "epoch": 0.02822177822177822, "grad_norm": 2.573823928833008, "learning_rate": 9.91640756409603e-05, "loss": 1.0571, "step": 565 }, { "epoch": 0.028471528471528472, "grad_norm": 2.9577178955078125, "learning_rate": 9.913859014220909e-05, "loss": 1.1632, "step": 570 }, { "epoch": 0.02872127872127872, "grad_norm": 3.2574944496154785, "learning_rate": 9.911310464345788e-05, "loss": 1.1144, "step": 575 }, { "epoch": 0.028971028971028972, "grad_norm": 2.675922155380249, "learning_rate": 9.908761914470666e-05, "loss": 1.0838, "step": 580 }, { "epoch": 0.02922077922077922, "grad_norm": 3.7051026821136475, "learning_rate": 9.906213364595546e-05, "loss": 1.0348, "step": 585 }, { "epoch": 0.029470529470529472, "grad_norm": 3.37589693069458, "learning_rate": 9.903664814720425e-05, "loss": 1.1155, "step": 590 }, { "epoch": 0.02972027972027972, "grad_norm": 2.3037402629852295, "learning_rate": 9.901116264845303e-05, "loss": 1.0763, "step": 595 }, { "epoch": 0.029970029970029972, "grad_norm": 2.622713088989258, "learning_rate": 9.898567714970183e-05, "loss": 1.1692, "step": 600 }, { "epoch": 0.03021978021978022, "grad_norm": 2.8527050018310547, "learning_rate": 9.896019165095062e-05, "loss": 1.1301, "step": 605 }, { "epoch": 0.030469530469530468, "grad_norm": 2.476747512817383, "learning_rate": 9.89347061521994e-05, "loss": 1.2133, "step": 610 }, { "epoch": 0.03071928071928072, "grad_norm": 2.484840154647827, "learning_rate": 9.89092206534482e-05, "loss": 1.1192, "step": 615 }, { "epoch": 0.030969030969030968, "grad_norm": 2.154974937438965, "learning_rate": 9.888373515469697e-05, "loss": 1.0691, "step": 620 }, { "epoch": 0.03121878121878122, "grad_norm": 1.9163360595703125, "learning_rate": 9.885824965594577e-05, "loss": 1.1043, "step": 625 }, { "epoch": 0.03146853146853147, "grad_norm": 2.6048336029052734, "learning_rate": 9.883276415719456e-05, "loss": 1.1289, "step": 630 }, { "epoch": 0.03171828171828172, "grad_norm": 2.598798990249634, "learning_rate": 9.880727865844334e-05, "loss": 1.1225, "step": 635 }, { "epoch": 0.03196803196803197, "grad_norm": 2.245591878890991, "learning_rate": 9.878179315969214e-05, "loss": 1.0474, "step": 640 }, { "epoch": 0.032217782217782216, "grad_norm": 2.1708250045776367, "learning_rate": 9.875630766094093e-05, "loss": 1.064, "step": 645 }, { "epoch": 0.032467532467532464, "grad_norm": 2.4297585487365723, "learning_rate": 9.873082216218971e-05, "loss": 1.2008, "step": 650 }, { "epoch": 0.03271728271728272, "grad_norm": 2.556823968887329, "learning_rate": 9.870533666343851e-05, "loss": 1.1179, "step": 655 }, { "epoch": 0.03296703296703297, "grad_norm": 2.9810869693756104, "learning_rate": 9.86798511646873e-05, "loss": 1.1155, "step": 660 }, { "epoch": 0.033216783216783216, "grad_norm": 2.2953784465789795, "learning_rate": 9.865436566593608e-05, "loss": 1.0836, "step": 665 }, { "epoch": 0.033466533466533464, "grad_norm": 1.9675393104553223, "learning_rate": 9.862888016718488e-05, "loss": 1.0758, "step": 670 }, { "epoch": 0.03371628371628372, "grad_norm": 2.156248092651367, "learning_rate": 9.860339466843367e-05, "loss": 1.1121, "step": 675 }, { "epoch": 0.03396603396603397, "grad_norm": 2.1987199783325195, "learning_rate": 9.857790916968245e-05, "loss": 1.108, "step": 680 }, { "epoch": 0.034215784215784216, "grad_norm": 2.3923282623291016, "learning_rate": 9.855242367093125e-05, "loss": 1.1072, "step": 685 }, { "epoch": 0.034465534465534464, "grad_norm": 3.898613691329956, "learning_rate": 9.852693817218004e-05, "loss": 1.0121, "step": 690 }, { "epoch": 0.03471528471528471, "grad_norm": 2.1221420764923096, "learning_rate": 9.850145267342882e-05, "loss": 1.0311, "step": 695 }, { "epoch": 0.03496503496503497, "grad_norm": 1.8849890232086182, "learning_rate": 9.847596717467761e-05, "loss": 1.1659, "step": 700 }, { "epoch": 0.035214785214785216, "grad_norm": 1.9260637760162354, "learning_rate": 9.84504816759264e-05, "loss": 1.0847, "step": 705 }, { "epoch": 0.035464535464535464, "grad_norm": 2.4071691036224365, "learning_rate": 9.842499617717519e-05, "loss": 1.1216, "step": 710 }, { "epoch": 0.03571428571428571, "grad_norm": 2.488605499267578, "learning_rate": 9.839951067842398e-05, "loss": 1.0576, "step": 715 }, { "epoch": 0.03596403596403597, "grad_norm": 2.3075785636901855, "learning_rate": 9.837402517967276e-05, "loss": 1.0007, "step": 720 }, { "epoch": 0.036213786213786216, "grad_norm": 3.2621097564697266, "learning_rate": 9.834853968092156e-05, "loss": 1.1093, "step": 725 }, { "epoch": 0.036463536463536464, "grad_norm": 2.0933992862701416, "learning_rate": 9.832305418217035e-05, "loss": 1.1519, "step": 730 }, { "epoch": 0.03671328671328671, "grad_norm": 1.8631466627120972, "learning_rate": 9.829756868341913e-05, "loss": 1.0998, "step": 735 }, { "epoch": 0.03696303696303696, "grad_norm": 2.5202832221984863, "learning_rate": 9.827208318466793e-05, "loss": 1.1146, "step": 740 }, { "epoch": 0.037212787212787216, "grad_norm": 2.7940120697021484, "learning_rate": 9.824659768591672e-05, "loss": 1.1027, "step": 745 }, { "epoch": 0.037462537462537464, "grad_norm": 2.038707733154297, "learning_rate": 9.82211121871655e-05, "loss": 0.9527, "step": 750 }, { "epoch": 0.03771228771228771, "grad_norm": 2.4933929443359375, "learning_rate": 9.81956266884143e-05, "loss": 1.0776, "step": 755 }, { "epoch": 0.03796203796203796, "grad_norm": 1.950372576713562, "learning_rate": 9.817014118966309e-05, "loss": 1.1775, "step": 760 }, { "epoch": 0.03821178821178821, "grad_norm": 2.3740854263305664, "learning_rate": 9.814465569091189e-05, "loss": 0.9998, "step": 765 }, { "epoch": 0.038461538461538464, "grad_norm": 2.2166638374328613, "learning_rate": 9.811917019216067e-05, "loss": 1.1058, "step": 770 }, { "epoch": 0.03871128871128871, "grad_norm": 2.9701955318450928, "learning_rate": 9.809368469340946e-05, "loss": 1.1302, "step": 775 }, { "epoch": 0.03896103896103896, "grad_norm": 2.297008514404297, "learning_rate": 9.806819919465824e-05, "loss": 0.9911, "step": 780 }, { "epoch": 0.03921078921078921, "grad_norm": 1.9161550998687744, "learning_rate": 9.804271369590703e-05, "loss": 1.0391, "step": 785 }, { "epoch": 0.039460539460539464, "grad_norm": 2.607980966567993, "learning_rate": 9.801722819715581e-05, "loss": 1.1763, "step": 790 }, { "epoch": 0.03971028971028971, "grad_norm": 1.948833703994751, "learning_rate": 9.799174269840461e-05, "loss": 1.0667, "step": 795 }, { "epoch": 0.03996003996003996, "grad_norm": 2.3056819438934326, "learning_rate": 9.79662571996534e-05, "loss": 1.1657, "step": 800 }, { "epoch": 0.04020979020979021, "grad_norm": 2.2535955905914307, "learning_rate": 9.79407717009022e-05, "loss": 1.0665, "step": 805 }, { "epoch": 0.040459540459540456, "grad_norm": 2.0203304290771484, "learning_rate": 9.791528620215098e-05, "loss": 1.0119, "step": 810 }, { "epoch": 0.04070929070929071, "grad_norm": 1.9329692125320435, "learning_rate": 9.788980070339977e-05, "loss": 1.1206, "step": 815 }, { "epoch": 0.04095904095904096, "grad_norm": 2.2010931968688965, "learning_rate": 9.786431520464857e-05, "loss": 1.1077, "step": 820 }, { "epoch": 0.04120879120879121, "grad_norm": 1.9428123235702515, "learning_rate": 9.783882970589735e-05, "loss": 1.0229, "step": 825 }, { "epoch": 0.041458541458541456, "grad_norm": 2.2056124210357666, "learning_rate": 9.781334420714614e-05, "loss": 1.0409, "step": 830 }, { "epoch": 0.04170829170829171, "grad_norm": 1.9974044561386108, "learning_rate": 9.778785870839494e-05, "loss": 1.0594, "step": 835 }, { "epoch": 0.04195804195804196, "grad_norm": 2.656376600265503, "learning_rate": 9.776237320964372e-05, "loss": 1.1234, "step": 840 }, { "epoch": 0.04220779220779221, "grad_norm": 2.5825204849243164, "learning_rate": 9.77368877108925e-05, "loss": 1.1449, "step": 845 }, { "epoch": 0.042457542457542456, "grad_norm": 2.2040162086486816, "learning_rate": 9.77114022121413e-05, "loss": 1.0975, "step": 850 }, { "epoch": 0.042707292707292704, "grad_norm": 2.18552303314209, "learning_rate": 9.768591671339009e-05, "loss": 1.1024, "step": 855 }, { "epoch": 0.04295704295704296, "grad_norm": 2.434199810028076, "learning_rate": 9.766043121463888e-05, "loss": 1.166, "step": 860 }, { "epoch": 0.04320679320679321, "grad_norm": 3.678466796875, "learning_rate": 9.763494571588766e-05, "loss": 1.1151, "step": 865 }, { "epoch": 0.043456543456543456, "grad_norm": 1.7375770807266235, "learning_rate": 9.760946021713645e-05, "loss": 1.1912, "step": 870 }, { "epoch": 0.043706293706293704, "grad_norm": 2.0483288764953613, "learning_rate": 9.758397471838525e-05, "loss": 1.0888, "step": 875 }, { "epoch": 0.04395604395604396, "grad_norm": 1.8642252683639526, "learning_rate": 9.755848921963403e-05, "loss": 1.0843, "step": 880 }, { "epoch": 0.04420579420579421, "grad_norm": 2.947230577468872, "learning_rate": 9.753300372088282e-05, "loss": 1.0847, "step": 885 }, { "epoch": 0.044455544455544456, "grad_norm": 1.695863962173462, "learning_rate": 9.750751822213162e-05, "loss": 1.0545, "step": 890 }, { "epoch": 0.044705294705294704, "grad_norm": 2.2340335845947266, "learning_rate": 9.74820327233804e-05, "loss": 1.1854, "step": 895 }, { "epoch": 0.04495504495504495, "grad_norm": 1.8246809244155884, "learning_rate": 9.745654722462919e-05, "loss": 1.1257, "step": 900 }, { "epoch": 0.04520479520479521, "grad_norm": 1.933232069015503, "learning_rate": 9.743106172587799e-05, "loss": 1.1365, "step": 905 }, { "epoch": 0.045454545454545456, "grad_norm": 2.5051841735839844, "learning_rate": 9.740557622712677e-05, "loss": 1.0741, "step": 910 }, { "epoch": 0.045704295704295704, "grad_norm": 2.0148048400878906, "learning_rate": 9.738009072837556e-05, "loss": 1.1837, "step": 915 }, { "epoch": 0.04595404595404595, "grad_norm": 2.2304186820983887, "learning_rate": 9.735460522962436e-05, "loss": 1.0762, "step": 920 }, { "epoch": 0.0462037962037962, "grad_norm": 2.2290730476379395, "learning_rate": 9.732911973087314e-05, "loss": 1.0584, "step": 925 }, { "epoch": 0.046453546453546456, "grad_norm": 2.367117404937744, "learning_rate": 9.730363423212193e-05, "loss": 1.0668, "step": 930 }, { "epoch": 0.046703296703296704, "grad_norm": 1.5657598972320557, "learning_rate": 9.727814873337073e-05, "loss": 1.1219, "step": 935 }, { "epoch": 0.04695304695304695, "grad_norm": 2.5166282653808594, "learning_rate": 9.72526632346195e-05, "loss": 0.9858, "step": 940 }, { "epoch": 0.0472027972027972, "grad_norm": 2.0858988761901855, "learning_rate": 9.72271777358683e-05, "loss": 1.0373, "step": 945 }, { "epoch": 0.047452547452547456, "grad_norm": 3.5959551334381104, "learning_rate": 9.720169223711708e-05, "loss": 1.1164, "step": 950 }, { "epoch": 0.047702297702297704, "grad_norm": 1.855296015739441, "learning_rate": 9.717620673836587e-05, "loss": 1.0979, "step": 955 }, { "epoch": 0.04795204795204795, "grad_norm": 2.7213802337646484, "learning_rate": 9.715072123961467e-05, "loss": 1.0705, "step": 960 }, { "epoch": 0.0482017982017982, "grad_norm": 2.8437836170196533, "learning_rate": 9.712523574086345e-05, "loss": 1.0347, "step": 965 }, { "epoch": 0.04845154845154845, "grad_norm": 2.338642120361328, "learning_rate": 9.709975024211224e-05, "loss": 1.1293, "step": 970 }, { "epoch": 0.048701298701298704, "grad_norm": 1.965152382850647, "learning_rate": 9.707426474336104e-05, "loss": 1.169, "step": 975 }, { "epoch": 0.04895104895104895, "grad_norm": 1.834064245223999, "learning_rate": 9.704877924460982e-05, "loss": 1.0812, "step": 980 }, { "epoch": 0.0492007992007992, "grad_norm": 3.00486159324646, "learning_rate": 9.70232937458586e-05, "loss": 1.0097, "step": 985 }, { "epoch": 0.04945054945054945, "grad_norm": 1.6730338335037231, "learning_rate": 9.69978082471074e-05, "loss": 1.0533, "step": 990 }, { "epoch": 0.049700299700299704, "grad_norm": 3.562251091003418, "learning_rate": 9.697232274835619e-05, "loss": 1.0061, "step": 995 }, { "epoch": 0.04995004995004995, "grad_norm": 2.3844172954559326, "learning_rate": 9.694683724960498e-05, "loss": 1.0792, "step": 1000 }, { "epoch": 0.0501998001998002, "grad_norm": 2.211378574371338, "learning_rate": 9.692135175085378e-05, "loss": 1.0917, "step": 1005 }, { "epoch": 0.05044955044955045, "grad_norm": 1.959848165512085, "learning_rate": 9.689586625210256e-05, "loss": 1.0101, "step": 1010 }, { "epoch": 0.050699300699300696, "grad_norm": 2.372511863708496, "learning_rate": 9.687038075335136e-05, "loss": 1.0214, "step": 1015 }, { "epoch": 0.05094905094905095, "grad_norm": 3.2810800075531006, "learning_rate": 9.684489525460013e-05, "loss": 1.0616, "step": 1020 }, { "epoch": 0.0511988011988012, "grad_norm": 2.1822047233581543, "learning_rate": 9.681940975584892e-05, "loss": 1.1232, "step": 1025 }, { "epoch": 0.05144855144855145, "grad_norm": 1.9162653684616089, "learning_rate": 9.679392425709772e-05, "loss": 1.0815, "step": 1030 }, { "epoch": 0.051698301698301696, "grad_norm": 3.648998737335205, "learning_rate": 9.67684387583465e-05, "loss": 1.0012, "step": 1035 }, { "epoch": 0.05194805194805195, "grad_norm": 2.0620384216308594, "learning_rate": 9.674295325959529e-05, "loss": 0.9978, "step": 1040 }, { "epoch": 0.0521978021978022, "grad_norm": 2.799177408218384, "learning_rate": 9.671746776084409e-05, "loss": 0.9938, "step": 1045 }, { "epoch": 0.05244755244755245, "grad_norm": 2.409482002258301, "learning_rate": 9.669198226209287e-05, "loss": 1.166, "step": 1050 }, { "epoch": 0.052697302697302696, "grad_norm": 1.667229413986206, "learning_rate": 9.666649676334166e-05, "loss": 0.9709, "step": 1055 }, { "epoch": 0.052947052947052944, "grad_norm": 2.2200872898101807, "learning_rate": 9.664101126459046e-05, "loss": 1.1259, "step": 1060 }, { "epoch": 0.0531968031968032, "grad_norm": 2.714712381362915, "learning_rate": 9.661552576583924e-05, "loss": 1.0192, "step": 1065 }, { "epoch": 0.05344655344655345, "grad_norm": 2.3864729404449463, "learning_rate": 9.659004026708804e-05, "loss": 1.1227, "step": 1070 }, { "epoch": 0.053696303696303696, "grad_norm": 3.0118916034698486, "learning_rate": 9.656455476833682e-05, "loss": 1.0826, "step": 1075 }, { "epoch": 0.053946053946053944, "grad_norm": 1.8575149774551392, "learning_rate": 9.653906926958561e-05, "loss": 1.088, "step": 1080 }, { "epoch": 0.05419580419580419, "grad_norm": 1.9772404432296753, "learning_rate": 9.651358377083441e-05, "loss": 1.0153, "step": 1085 }, { "epoch": 0.05444555444555445, "grad_norm": 1.84340238571167, "learning_rate": 9.64880982720832e-05, "loss": 1.1591, "step": 1090 }, { "epoch": 0.054695304695304696, "grad_norm": 2.54854416847229, "learning_rate": 9.646261277333198e-05, "loss": 1.0241, "step": 1095 }, { "epoch": 0.054945054945054944, "grad_norm": 2.298405647277832, "learning_rate": 9.643712727458077e-05, "loss": 1.1486, "step": 1100 }, { "epoch": 0.05519480519480519, "grad_norm": 2.1085455417633057, "learning_rate": 9.641164177582955e-05, "loss": 1.0114, "step": 1105 }, { "epoch": 0.05544455544455545, "grad_norm": 2.9504263401031494, "learning_rate": 9.638615627707835e-05, "loss": 1.105, "step": 1110 }, { "epoch": 0.055694305694305696, "grad_norm": 1.9672647714614868, "learning_rate": 9.636067077832714e-05, "loss": 1.0401, "step": 1115 }, { "epoch": 0.055944055944055944, "grad_norm": 1.841833233833313, "learning_rate": 9.633518527957592e-05, "loss": 1.1139, "step": 1120 }, { "epoch": 0.05619380619380619, "grad_norm": 2.0840795040130615, "learning_rate": 9.630969978082472e-05, "loss": 1.0855, "step": 1125 }, { "epoch": 0.05644355644355644, "grad_norm": 2.7331795692443848, "learning_rate": 9.62842142820735e-05, "loss": 1.0436, "step": 1130 }, { "epoch": 0.056693306693306696, "grad_norm": 2.174517869949341, "learning_rate": 9.625872878332229e-05, "loss": 0.9379, "step": 1135 }, { "epoch": 0.056943056943056944, "grad_norm": 2.8493294715881348, "learning_rate": 9.623324328457109e-05, "loss": 1.0385, "step": 1140 }, { "epoch": 0.05719280719280719, "grad_norm": 1.9348623752593994, "learning_rate": 9.620775778581987e-05, "loss": 1.1573, "step": 1145 }, { "epoch": 0.05744255744255744, "grad_norm": 1.9528578519821167, "learning_rate": 9.618227228706866e-05, "loss": 1.0287, "step": 1150 }, { "epoch": 0.057692307692307696, "grad_norm": 1.7765041589736938, "learning_rate": 9.615678678831746e-05, "loss": 1.1262, "step": 1155 }, { "epoch": 0.057942057942057944, "grad_norm": 2.2895193099975586, "learning_rate": 9.613130128956624e-05, "loss": 1.0944, "step": 1160 }, { "epoch": 0.05819180819180819, "grad_norm": 1.731370210647583, "learning_rate": 9.610581579081503e-05, "loss": 1.0653, "step": 1165 }, { "epoch": 0.05844155844155844, "grad_norm": 2.088162660598755, "learning_rate": 9.608033029206383e-05, "loss": 1.0958, "step": 1170 }, { "epoch": 0.05869130869130869, "grad_norm": 1.8357365131378174, "learning_rate": 9.605484479331261e-05, "loss": 1.044, "step": 1175 }, { "epoch": 0.058941058941058944, "grad_norm": 2.6642401218414307, "learning_rate": 9.60293592945614e-05, "loss": 1.0639, "step": 1180 }, { "epoch": 0.05919080919080919, "grad_norm": 1.9594494104385376, "learning_rate": 9.600387379581019e-05, "loss": 1.1335, "step": 1185 }, { "epoch": 0.05944055944055944, "grad_norm": 1.8316797018051147, "learning_rate": 9.597838829705897e-05, "loss": 1.0021, "step": 1190 }, { "epoch": 0.05969030969030969, "grad_norm": 1.7587838172912598, "learning_rate": 9.595290279830777e-05, "loss": 1.0629, "step": 1195 }, { "epoch": 0.059940059940059943, "grad_norm": 2.5461153984069824, "learning_rate": 9.592741729955655e-05, "loss": 1.1183, "step": 1200 }, { "epoch": 0.06018981018981019, "grad_norm": 1.6729899644851685, "learning_rate": 9.590193180080534e-05, "loss": 1.197, "step": 1205 }, { "epoch": 0.06043956043956044, "grad_norm": 1.928887963294983, "learning_rate": 9.587644630205414e-05, "loss": 1.0389, "step": 1210 }, { "epoch": 0.06068931068931069, "grad_norm": 2.454225778579712, "learning_rate": 9.585096080330292e-05, "loss": 1.1014, "step": 1215 }, { "epoch": 0.060939060939060936, "grad_norm": 2.664700746536255, "learning_rate": 9.582547530455171e-05, "loss": 1.0074, "step": 1220 }, { "epoch": 0.06118881118881119, "grad_norm": 1.7048535346984863, "learning_rate": 9.579998980580051e-05, "loss": 1.1905, "step": 1225 }, { "epoch": 0.06143856143856144, "grad_norm": 2.1960554122924805, "learning_rate": 9.57745043070493e-05, "loss": 1.0857, "step": 1230 }, { "epoch": 0.06168831168831169, "grad_norm": 2.3340139389038086, "learning_rate": 9.574901880829808e-05, "loss": 1.0855, "step": 1235 }, { "epoch": 0.061938061938061936, "grad_norm": 1.953961730003357, "learning_rate": 9.572353330954688e-05, "loss": 1.0418, "step": 1240 }, { "epoch": 0.062187812187812185, "grad_norm": 2.0407261848449707, "learning_rate": 9.569804781079566e-05, "loss": 1.0792, "step": 1245 }, { "epoch": 0.06243756243756244, "grad_norm": 1.8920087814331055, "learning_rate": 9.567256231204445e-05, "loss": 1.0715, "step": 1250 }, { "epoch": 0.06268731268731269, "grad_norm": 2.353179454803467, "learning_rate": 9.564707681329325e-05, "loss": 1.0926, "step": 1255 }, { "epoch": 0.06293706293706294, "grad_norm": 2.2123873233795166, "learning_rate": 9.562159131454202e-05, "loss": 1.0905, "step": 1260 }, { "epoch": 0.06318681318681318, "grad_norm": 1.9480962753295898, "learning_rate": 9.559610581579082e-05, "loss": 1.1224, "step": 1265 }, { "epoch": 0.06343656343656344, "grad_norm": 2.8749778270721436, "learning_rate": 9.55706203170396e-05, "loss": 1.0514, "step": 1270 }, { "epoch": 0.06368631368631368, "grad_norm": 3.9483730792999268, "learning_rate": 9.554513481828839e-05, "loss": 1.054, "step": 1275 }, { "epoch": 0.06393606393606394, "grad_norm": 1.7393423318862915, "learning_rate": 9.551964931953719e-05, "loss": 1.1657, "step": 1280 }, { "epoch": 0.06418581418581419, "grad_norm": 1.7806570529937744, "learning_rate": 9.549416382078597e-05, "loss": 0.9378, "step": 1285 }, { "epoch": 0.06443556443556443, "grad_norm": 1.8988239765167236, "learning_rate": 9.546867832203476e-05, "loss": 1.0344, "step": 1290 }, { "epoch": 0.06468531468531469, "grad_norm": 1.988075852394104, "learning_rate": 9.544319282328356e-05, "loss": 1.076, "step": 1295 }, { "epoch": 0.06493506493506493, "grad_norm": 1.8742318153381348, "learning_rate": 9.541770732453234e-05, "loss": 1.127, "step": 1300 }, { "epoch": 0.06518481518481518, "grad_norm": 2.171926259994507, "learning_rate": 9.539222182578113e-05, "loss": 1.1676, "step": 1305 }, { "epoch": 0.06543456543456544, "grad_norm": 1.6614676713943481, "learning_rate": 9.536673632702993e-05, "loss": 0.9695, "step": 1310 }, { "epoch": 0.06568431568431568, "grad_norm": 2.1950104236602783, "learning_rate": 9.534125082827871e-05, "loss": 1.0504, "step": 1315 }, { "epoch": 0.06593406593406594, "grad_norm": 2.9726340770721436, "learning_rate": 9.531576532952751e-05, "loss": 0.9434, "step": 1320 }, { "epoch": 0.06618381618381618, "grad_norm": 3.9498984813690186, "learning_rate": 9.52902798307763e-05, "loss": 1.085, "step": 1325 }, { "epoch": 0.06643356643356643, "grad_norm": 1.8510884046554565, "learning_rate": 9.526479433202508e-05, "loss": 1.1261, "step": 1330 }, { "epoch": 0.06668331668331669, "grad_norm": 3.170361042022705, "learning_rate": 9.523930883327388e-05, "loss": 1.0072, "step": 1335 }, { "epoch": 0.06693306693306693, "grad_norm": 1.8833476305007935, "learning_rate": 9.521382333452265e-05, "loss": 1.0844, "step": 1340 }, { "epoch": 0.06718281718281718, "grad_norm": 1.8673526048660278, "learning_rate": 9.518833783577144e-05, "loss": 1.0186, "step": 1345 }, { "epoch": 0.06743256743256744, "grad_norm": 2.5537397861480713, "learning_rate": 9.516285233702024e-05, "loss": 1.0558, "step": 1350 }, { "epoch": 0.06768231768231768, "grad_norm": 2.046783208847046, "learning_rate": 9.513736683826902e-05, "loss": 1.0455, "step": 1355 }, { "epoch": 0.06793206793206794, "grad_norm": 1.8869708776474, "learning_rate": 9.511188133951781e-05, "loss": 1.0464, "step": 1360 }, { "epoch": 0.06818181818181818, "grad_norm": 1.8103328943252563, "learning_rate": 9.508639584076661e-05, "loss": 1.0643, "step": 1365 }, { "epoch": 0.06843156843156843, "grad_norm": 1.8245409727096558, "learning_rate": 9.50609103420154e-05, "loss": 1.0346, "step": 1370 }, { "epoch": 0.06868131868131869, "grad_norm": 2.0209765434265137, "learning_rate": 9.50354248432642e-05, "loss": 1.1622, "step": 1375 }, { "epoch": 0.06893106893106893, "grad_norm": 1.98048996925354, "learning_rate": 9.500993934451298e-05, "loss": 1.0625, "step": 1380 }, { "epoch": 0.06918081918081918, "grad_norm": 2.1545944213867188, "learning_rate": 9.498445384576176e-05, "loss": 1.116, "step": 1385 }, { "epoch": 0.06943056943056942, "grad_norm": 2.4843201637268066, "learning_rate": 9.495896834701056e-05, "loss": 1.0728, "step": 1390 }, { "epoch": 0.06968031968031968, "grad_norm": 2.090421676635742, "learning_rate": 9.493348284825935e-05, "loss": 0.9982, "step": 1395 }, { "epoch": 0.06993006993006994, "grad_norm": 2.3593664169311523, "learning_rate": 9.490799734950813e-05, "loss": 1.0449, "step": 1400 }, { "epoch": 0.07017982017982018, "grad_norm": 1.8505982160568237, "learning_rate": 9.488251185075693e-05, "loss": 1.083, "step": 1405 }, { "epoch": 0.07042957042957043, "grad_norm": 1.4218089580535889, "learning_rate": 9.485702635200572e-05, "loss": 1.0094, "step": 1410 }, { "epoch": 0.07067932067932067, "grad_norm": 1.7999874353408813, "learning_rate": 9.48315408532545e-05, "loss": 1.0699, "step": 1415 }, { "epoch": 0.07092907092907093, "grad_norm": 2.2292604446411133, "learning_rate": 9.480605535450329e-05, "loss": 1.1843, "step": 1420 }, { "epoch": 0.07117882117882118, "grad_norm": 2.1728875637054443, "learning_rate": 9.478056985575207e-05, "loss": 0.9808, "step": 1425 }, { "epoch": 0.07142857142857142, "grad_norm": 2.2584636211395264, "learning_rate": 9.475508435700087e-05, "loss": 1.1534, "step": 1430 }, { "epoch": 0.07167832167832168, "grad_norm": 1.9276480674743652, "learning_rate": 9.472959885824966e-05, "loss": 1.046, "step": 1435 }, { "epoch": 0.07192807192807193, "grad_norm": 2.4217398166656494, "learning_rate": 9.470411335949844e-05, "loss": 1.2047, "step": 1440 }, { "epoch": 0.07217782217782218, "grad_norm": 1.8861360549926758, "learning_rate": 9.467862786074724e-05, "loss": 1.0244, "step": 1445 }, { "epoch": 0.07242757242757243, "grad_norm": 1.5331791639328003, "learning_rate": 9.465314236199603e-05, "loss": 1.0758, "step": 1450 }, { "epoch": 0.07267732267732267, "grad_norm": 2.146282196044922, "learning_rate": 9.462765686324481e-05, "loss": 1.0951, "step": 1455 }, { "epoch": 0.07292707292707293, "grad_norm": 2.0330257415771484, "learning_rate": 9.460217136449361e-05, "loss": 1.0483, "step": 1460 }, { "epoch": 0.07317682317682318, "grad_norm": 2.2604196071624756, "learning_rate": 9.45766858657424e-05, "loss": 1.1073, "step": 1465 }, { "epoch": 0.07342657342657342, "grad_norm": 1.981196403503418, "learning_rate": 9.455120036699118e-05, "loss": 1.1375, "step": 1470 }, { "epoch": 0.07367632367632368, "grad_norm": 1.8137750625610352, "learning_rate": 9.452571486823998e-05, "loss": 1.0737, "step": 1475 }, { "epoch": 0.07392607392607392, "grad_norm": 2.385080575942993, "learning_rate": 9.450022936948877e-05, "loss": 1.1252, "step": 1480 }, { "epoch": 0.07417582417582418, "grad_norm": 1.6133877038955688, "learning_rate": 9.447474387073755e-05, "loss": 1.012, "step": 1485 }, { "epoch": 0.07442557442557443, "grad_norm": 1.976572871208191, "learning_rate": 9.444925837198635e-05, "loss": 1.0457, "step": 1490 }, { "epoch": 0.07467532467532467, "grad_norm": 2.263914108276367, "learning_rate": 9.442377287323514e-05, "loss": 1.1192, "step": 1495 }, { "epoch": 0.07492507492507493, "grad_norm": 2.1266672611236572, "learning_rate": 9.439828737448392e-05, "loss": 1.101, "step": 1500 }, { "epoch": 0.07517482517482517, "grad_norm": 2.593883991241455, "learning_rate": 9.437280187573271e-05, "loss": 1.0824, "step": 1505 }, { "epoch": 0.07542457542457542, "grad_norm": 2.673349618911743, "learning_rate": 9.43473163769815e-05, "loss": 0.965, "step": 1510 }, { "epoch": 0.07567432567432568, "grad_norm": 2.3759806156158447, "learning_rate": 9.432183087823029e-05, "loss": 1.0273, "step": 1515 }, { "epoch": 0.07592407592407592, "grad_norm": 1.5420026779174805, "learning_rate": 9.429634537947908e-05, "loss": 1.1288, "step": 1520 }, { "epoch": 0.07617382617382618, "grad_norm": 1.9897898435592651, "learning_rate": 9.427085988072786e-05, "loss": 1.0961, "step": 1525 }, { "epoch": 0.07642357642357642, "grad_norm": 1.6935060024261475, "learning_rate": 9.424537438197666e-05, "loss": 1.0206, "step": 1530 }, { "epoch": 0.07667332667332667, "grad_norm": 1.9373195171356201, "learning_rate": 9.421988888322545e-05, "loss": 1.149, "step": 1535 }, { "epoch": 0.07692307692307693, "grad_norm": 1.7390213012695312, "learning_rate": 9.419440338447423e-05, "loss": 1.0844, "step": 1540 }, { "epoch": 0.07717282717282717, "grad_norm": 2.339433193206787, "learning_rate": 9.416891788572303e-05, "loss": 1.0654, "step": 1545 }, { "epoch": 0.07742257742257742, "grad_norm": 1.8696469068527222, "learning_rate": 9.414343238697182e-05, "loss": 1.0267, "step": 1550 }, { "epoch": 0.07767232767232768, "grad_norm": 1.9753731489181519, "learning_rate": 9.41179468882206e-05, "loss": 0.9779, "step": 1555 }, { "epoch": 0.07792207792207792, "grad_norm": 2.466909885406494, "learning_rate": 9.40924613894694e-05, "loss": 1.0785, "step": 1560 }, { "epoch": 0.07817182817182818, "grad_norm": 2.188283920288086, "learning_rate": 9.406697589071819e-05, "loss": 1.0192, "step": 1565 }, { "epoch": 0.07842157842157842, "grad_norm": 1.973293423652649, "learning_rate": 9.404149039196697e-05, "loss": 1.0632, "step": 1570 }, { "epoch": 0.07867132867132867, "grad_norm": 1.6519509553909302, "learning_rate": 9.401600489321577e-05, "loss": 1.085, "step": 1575 }, { "epoch": 0.07892107892107893, "grad_norm": 2.235830783843994, "learning_rate": 9.399051939446454e-05, "loss": 1.0508, "step": 1580 }, { "epoch": 0.07917082917082917, "grad_norm": 1.8035004138946533, "learning_rate": 9.396503389571334e-05, "loss": 1.009, "step": 1585 }, { "epoch": 0.07942057942057942, "grad_norm": 2.4992175102233887, "learning_rate": 9.393954839696213e-05, "loss": 1.0369, "step": 1590 }, { "epoch": 0.07967032967032966, "grad_norm": 2.039412498474121, "learning_rate": 9.391406289821091e-05, "loss": 1.1572, "step": 1595 }, { "epoch": 0.07992007992007992, "grad_norm": 1.5818575620651245, "learning_rate": 9.388857739945971e-05, "loss": 1.0685, "step": 1600 }, { "epoch": 0.08016983016983018, "grad_norm": 1.9173918962478638, "learning_rate": 9.38630919007085e-05, "loss": 1.0972, "step": 1605 }, { "epoch": 0.08041958041958042, "grad_norm": 1.5649491548538208, "learning_rate": 9.383760640195728e-05, "loss": 1.1117, "step": 1610 }, { "epoch": 0.08066933066933067, "grad_norm": 1.98507559299469, "learning_rate": 9.381212090320608e-05, "loss": 1.1697, "step": 1615 }, { "epoch": 0.08091908091908091, "grad_norm": 2.137964963912964, "learning_rate": 9.378663540445487e-05, "loss": 1.0668, "step": 1620 }, { "epoch": 0.08116883116883117, "grad_norm": 1.862763524055481, "learning_rate": 9.376114990570367e-05, "loss": 1.0785, "step": 1625 }, { "epoch": 0.08141858141858142, "grad_norm": 3.6208596229553223, "learning_rate": 9.373566440695245e-05, "loss": 1.0707, "step": 1630 }, { "epoch": 0.08166833166833166, "grad_norm": 1.8828054666519165, "learning_rate": 9.371017890820124e-05, "loss": 1.0225, "step": 1635 }, { "epoch": 0.08191808191808192, "grad_norm": 1.9555052518844604, "learning_rate": 9.368469340945004e-05, "loss": 1.0905, "step": 1640 }, { "epoch": 0.08216783216783216, "grad_norm": 2.04486346244812, "learning_rate": 9.365920791069882e-05, "loss": 1.0368, "step": 1645 }, { "epoch": 0.08241758241758242, "grad_norm": 2.1272482872009277, "learning_rate": 9.363372241194761e-05, "loss": 1.1441, "step": 1650 }, { "epoch": 0.08266733266733267, "grad_norm": 1.9807994365692139, "learning_rate": 9.36082369131964e-05, "loss": 1.0914, "step": 1655 }, { "epoch": 0.08291708291708291, "grad_norm": 1.9267189502716064, "learning_rate": 9.358275141444518e-05, "loss": 1.1307, "step": 1660 }, { "epoch": 0.08316683316683317, "grad_norm": 2.2743327617645264, "learning_rate": 9.355726591569396e-05, "loss": 0.9578, "step": 1665 }, { "epoch": 0.08341658341658342, "grad_norm": 2.1046268939971924, "learning_rate": 9.353178041694276e-05, "loss": 1.0832, "step": 1670 }, { "epoch": 0.08366633366633366, "grad_norm": 1.8345096111297607, "learning_rate": 9.350629491819155e-05, "loss": 1.0514, "step": 1675 }, { "epoch": 0.08391608391608392, "grad_norm": 2.2238266468048096, "learning_rate": 9.348080941944035e-05, "loss": 0.9749, "step": 1680 }, { "epoch": 0.08416583416583416, "grad_norm": 1.7355965375900269, "learning_rate": 9.345532392068913e-05, "loss": 1.0563, "step": 1685 }, { "epoch": 0.08441558441558442, "grad_norm": 1.7400983572006226, "learning_rate": 9.342983842193792e-05, "loss": 1.0381, "step": 1690 }, { "epoch": 0.08466533466533467, "grad_norm": 1.6807479858398438, "learning_rate": 9.340435292318672e-05, "loss": 1.0874, "step": 1695 }, { "epoch": 0.08491508491508491, "grad_norm": 2.5074424743652344, "learning_rate": 9.33788674244355e-05, "loss": 1.0522, "step": 1700 }, { "epoch": 0.08516483516483517, "grad_norm": 1.9145509004592896, "learning_rate": 9.335338192568429e-05, "loss": 1.1154, "step": 1705 }, { "epoch": 0.08541458541458541, "grad_norm": 2.057141065597534, "learning_rate": 9.332789642693309e-05, "loss": 1.0421, "step": 1710 }, { "epoch": 0.08566433566433566, "grad_norm": 1.6423461437225342, "learning_rate": 9.330241092818187e-05, "loss": 1.0792, "step": 1715 }, { "epoch": 0.08591408591408592, "grad_norm": 2.4497835636138916, "learning_rate": 9.327692542943066e-05, "loss": 1.0508, "step": 1720 }, { "epoch": 0.08616383616383616, "grad_norm": 2.2422215938568115, "learning_rate": 9.325143993067946e-05, "loss": 0.9341, "step": 1725 }, { "epoch": 0.08641358641358642, "grad_norm": 3.395003080368042, "learning_rate": 9.322595443192824e-05, "loss": 1.1383, "step": 1730 }, { "epoch": 0.08666333666333666, "grad_norm": 2.15932559967041, "learning_rate": 9.320046893317703e-05, "loss": 0.9627, "step": 1735 }, { "epoch": 0.08691308691308691, "grad_norm": 2.202414035797119, "learning_rate": 9.317498343442581e-05, "loss": 1.1128, "step": 1740 }, { "epoch": 0.08716283716283717, "grad_norm": 1.9424654245376587, "learning_rate": 9.31494979356746e-05, "loss": 1.0885, "step": 1745 }, { "epoch": 0.08741258741258741, "grad_norm": 2.5046703815460205, "learning_rate": 9.31240124369234e-05, "loss": 0.9815, "step": 1750 }, { "epoch": 0.08766233766233766, "grad_norm": 2.7471020221710205, "learning_rate": 9.309852693817218e-05, "loss": 0.9854, "step": 1755 }, { "epoch": 0.08791208791208792, "grad_norm": 1.8473713397979736, "learning_rate": 9.307304143942097e-05, "loss": 1.0858, "step": 1760 }, { "epoch": 0.08816183816183816, "grad_norm": 2.0180139541625977, "learning_rate": 9.304755594066977e-05, "loss": 0.9825, "step": 1765 }, { "epoch": 0.08841158841158842, "grad_norm": 1.7164231538772583, "learning_rate": 9.302207044191855e-05, "loss": 0.9741, "step": 1770 }, { "epoch": 0.08866133866133866, "grad_norm": 1.9583089351654053, "learning_rate": 9.299658494316734e-05, "loss": 1.0438, "step": 1775 }, { "epoch": 0.08891108891108891, "grad_norm": 1.726304292678833, "learning_rate": 9.297109944441614e-05, "loss": 1.0664, "step": 1780 }, { "epoch": 0.08916083916083917, "grad_norm": 2.1803619861602783, "learning_rate": 9.294561394566492e-05, "loss": 0.9813, "step": 1785 }, { "epoch": 0.08941058941058941, "grad_norm": 1.594977617263794, "learning_rate": 9.292012844691371e-05, "loss": 1.1342, "step": 1790 }, { "epoch": 0.08966033966033966, "grad_norm": 2.358466386795044, "learning_rate": 9.28946429481625e-05, "loss": 1.1309, "step": 1795 }, { "epoch": 0.0899100899100899, "grad_norm": 1.7947657108306885, "learning_rate": 9.286915744941129e-05, "loss": 1.0647, "step": 1800 }, { "epoch": 0.09015984015984016, "grad_norm": 4.265115737915039, "learning_rate": 9.284367195066008e-05, "loss": 0.9946, "step": 1805 }, { "epoch": 0.09040959040959042, "grad_norm": 2.645778179168701, "learning_rate": 9.281818645190888e-05, "loss": 1.0288, "step": 1810 }, { "epoch": 0.09065934065934066, "grad_norm": 2.1937143802642822, "learning_rate": 9.279270095315766e-05, "loss": 1.0036, "step": 1815 }, { "epoch": 0.09090909090909091, "grad_norm": 1.9165890216827393, "learning_rate": 9.276721545440645e-05, "loss": 1.0864, "step": 1820 }, { "epoch": 0.09115884115884115, "grad_norm": 2.686234712600708, "learning_rate": 9.274172995565523e-05, "loss": 1.0925, "step": 1825 }, { "epoch": 0.09140859140859141, "grad_norm": 1.8842302560806274, "learning_rate": 9.271624445690402e-05, "loss": 1.1006, "step": 1830 }, { "epoch": 0.09165834165834166, "grad_norm": 1.7551827430725098, "learning_rate": 9.269075895815282e-05, "loss": 1.0847, "step": 1835 }, { "epoch": 0.0919080919080919, "grad_norm": 1.789575219154358, "learning_rate": 9.26652734594016e-05, "loss": 1.1, "step": 1840 }, { "epoch": 0.09215784215784216, "grad_norm": 1.6560639142990112, "learning_rate": 9.263978796065039e-05, "loss": 1.0926, "step": 1845 }, { "epoch": 0.0924075924075924, "grad_norm": 2.4609718322753906, "learning_rate": 9.261430246189919e-05, "loss": 1.0652, "step": 1850 }, { "epoch": 0.09265734265734266, "grad_norm": 2.815316915512085, "learning_rate": 9.258881696314797e-05, "loss": 0.9297, "step": 1855 }, { "epoch": 0.09290709290709291, "grad_norm": 1.9317278861999512, "learning_rate": 9.256333146439676e-05, "loss": 1.0361, "step": 1860 }, { "epoch": 0.09315684315684315, "grad_norm": 1.8428795337677002, "learning_rate": 9.253784596564556e-05, "loss": 1.0118, "step": 1865 }, { "epoch": 0.09340659340659341, "grad_norm": 1.8035210371017456, "learning_rate": 9.251236046689434e-05, "loss": 1.0756, "step": 1870 }, { "epoch": 0.09365634365634366, "grad_norm": 1.7311344146728516, "learning_rate": 9.248687496814313e-05, "loss": 1.1516, "step": 1875 }, { "epoch": 0.0939060939060939, "grad_norm": 2.3379533290863037, "learning_rate": 9.246138946939193e-05, "loss": 1.1268, "step": 1880 }, { "epoch": 0.09415584415584416, "grad_norm": 2.288144588470459, "learning_rate": 9.243590397064071e-05, "loss": 1.0325, "step": 1885 }, { "epoch": 0.0944055944055944, "grad_norm": 1.7460414171218872, "learning_rate": 9.241041847188951e-05, "loss": 1.0785, "step": 1890 }, { "epoch": 0.09465534465534466, "grad_norm": 2.111440896987915, "learning_rate": 9.23849329731383e-05, "loss": 1.0625, "step": 1895 }, { "epoch": 0.09490509490509491, "grad_norm": 1.8377785682678223, "learning_rate": 9.235944747438707e-05, "loss": 1.0417, "step": 1900 }, { "epoch": 0.09515484515484515, "grad_norm": 1.8210581541061401, "learning_rate": 9.233396197563587e-05, "loss": 1.0947, "step": 1905 }, { "epoch": 0.09540459540459541, "grad_norm": 2.269029378890991, "learning_rate": 9.230847647688465e-05, "loss": 0.9578, "step": 1910 }, { "epoch": 0.09565434565434565, "grad_norm": 1.8923817873001099, "learning_rate": 9.228299097813344e-05, "loss": 1.0312, "step": 1915 }, { "epoch": 0.0959040959040959, "grad_norm": 3.1870830059051514, "learning_rate": 9.225750547938224e-05, "loss": 0.9771, "step": 1920 }, { "epoch": 0.09615384615384616, "grad_norm": 2.1532297134399414, "learning_rate": 9.223201998063102e-05, "loss": 1.1079, "step": 1925 }, { "epoch": 0.0964035964035964, "grad_norm": 1.9820715188980103, "learning_rate": 9.220653448187982e-05, "loss": 1.1108, "step": 1930 }, { "epoch": 0.09665334665334666, "grad_norm": 2.5562918186187744, "learning_rate": 9.21810489831286e-05, "loss": 1.1529, "step": 1935 }, { "epoch": 0.0969030969030969, "grad_norm": 2.216808319091797, "learning_rate": 9.215556348437739e-05, "loss": 1.052, "step": 1940 }, { "epoch": 0.09715284715284715, "grad_norm": 1.8855540752410889, "learning_rate": 9.213007798562619e-05, "loss": 1.1045, "step": 1945 }, { "epoch": 0.09740259740259741, "grad_norm": 2.4225239753723145, "learning_rate": 9.210459248687498e-05, "loss": 1.0827, "step": 1950 }, { "epoch": 0.09765234765234765, "grad_norm": 2.164170503616333, "learning_rate": 9.207910698812376e-05, "loss": 1.1598, "step": 1955 }, { "epoch": 0.0979020979020979, "grad_norm": 1.6184277534484863, "learning_rate": 9.205362148937256e-05, "loss": 1.0059, "step": 1960 }, { "epoch": 0.09815184815184815, "grad_norm": 1.4765101671218872, "learning_rate": 9.202813599062135e-05, "loss": 1.1232, "step": 1965 }, { "epoch": 0.0984015984015984, "grad_norm": 1.805996298789978, "learning_rate": 9.200265049187013e-05, "loss": 1.039, "step": 1970 }, { "epoch": 0.09865134865134866, "grad_norm": 2.1366546154022217, "learning_rate": 9.197716499311893e-05, "loss": 1.0547, "step": 1975 }, { "epoch": 0.0989010989010989, "grad_norm": 1.7093604803085327, "learning_rate": 9.19516794943677e-05, "loss": 0.9783, "step": 1980 }, { "epoch": 0.09915084915084915, "grad_norm": 2.013899803161621, "learning_rate": 9.19261939956165e-05, "loss": 1.0716, "step": 1985 }, { "epoch": 0.09940059940059941, "grad_norm": 1.6569393873214722, "learning_rate": 9.190070849686529e-05, "loss": 1.0557, "step": 1990 }, { "epoch": 0.09965034965034965, "grad_norm": 1.436673879623413, "learning_rate": 9.187522299811407e-05, "loss": 1.0273, "step": 1995 }, { "epoch": 0.0999000999000999, "grad_norm": 1.6481308937072754, "learning_rate": 9.184973749936287e-05, "loss": 1.0199, "step": 2000 }, { "epoch": 0.10014985014985014, "grad_norm": 2.1242127418518066, "learning_rate": 9.182425200061166e-05, "loss": 1.0621, "step": 2005 }, { "epoch": 0.1003996003996004, "grad_norm": 1.708878993988037, "learning_rate": 9.179876650186044e-05, "loss": 1.0372, "step": 2010 }, { "epoch": 0.10064935064935066, "grad_norm": 2.1060447692871094, "learning_rate": 9.177328100310924e-05, "loss": 1.0382, "step": 2015 }, { "epoch": 0.1008991008991009, "grad_norm": 1.8815819025039673, "learning_rate": 9.174779550435803e-05, "loss": 1.1084, "step": 2020 }, { "epoch": 0.10114885114885115, "grad_norm": 2.150170087814331, "learning_rate": 9.172231000560681e-05, "loss": 0.9927, "step": 2025 }, { "epoch": 0.10139860139860139, "grad_norm": 2.160966396331787, "learning_rate": 9.169682450685561e-05, "loss": 0.9507, "step": 2030 }, { "epoch": 0.10164835164835165, "grad_norm": 1.7299655675888062, "learning_rate": 9.16713390081044e-05, "loss": 1.1534, "step": 2035 }, { "epoch": 0.1018981018981019, "grad_norm": 1.925952672958374, "learning_rate": 9.164585350935318e-05, "loss": 1.0709, "step": 2040 }, { "epoch": 0.10214785214785214, "grad_norm": 3.60642409324646, "learning_rate": 9.162036801060198e-05, "loss": 1.0507, "step": 2045 }, { "epoch": 0.1023976023976024, "grad_norm": 2.143662929534912, "learning_rate": 9.159488251185077e-05, "loss": 1.084, "step": 2050 }, { "epoch": 0.10264735264735264, "grad_norm": 2.546248197555542, "learning_rate": 9.156939701309955e-05, "loss": 1.1297, "step": 2055 }, { "epoch": 0.1028971028971029, "grad_norm": 2.100470542907715, "learning_rate": 9.154391151434834e-05, "loss": 1.021, "step": 2060 }, { "epoch": 0.10314685314685315, "grad_norm": 2.970125675201416, "learning_rate": 9.151842601559712e-05, "loss": 1.02, "step": 2065 }, { "epoch": 0.10339660339660339, "grad_norm": 1.652052640914917, "learning_rate": 9.149294051684592e-05, "loss": 1.1092, "step": 2070 }, { "epoch": 0.10364635364635365, "grad_norm": 1.6848113536834717, "learning_rate": 9.14674550180947e-05, "loss": 1.0742, "step": 2075 }, { "epoch": 0.1038961038961039, "grad_norm": 2.1029481887817383, "learning_rate": 9.144196951934349e-05, "loss": 1.0568, "step": 2080 }, { "epoch": 0.10414585414585414, "grad_norm": 2.3390204906463623, "learning_rate": 9.141648402059229e-05, "loss": 1.0175, "step": 2085 }, { "epoch": 0.1043956043956044, "grad_norm": 1.8294017314910889, "learning_rate": 9.139099852184108e-05, "loss": 1.0326, "step": 2090 }, { "epoch": 0.10464535464535464, "grad_norm": 1.8346604108810425, "learning_rate": 9.136551302308986e-05, "loss": 1.0781, "step": 2095 }, { "epoch": 0.1048951048951049, "grad_norm": 2.061452865600586, "learning_rate": 9.134002752433866e-05, "loss": 1.007, "step": 2100 }, { "epoch": 0.10514485514485515, "grad_norm": 2.686570405960083, "learning_rate": 9.131454202558745e-05, "loss": 1.0743, "step": 2105 }, { "epoch": 0.10539460539460539, "grad_norm": 3.0611164569854736, "learning_rate": 9.128905652683623e-05, "loss": 1.0798, "step": 2110 }, { "epoch": 0.10564435564435565, "grad_norm": 2.021026849746704, "learning_rate": 9.126357102808503e-05, "loss": 1.0732, "step": 2115 }, { "epoch": 0.10589410589410589, "grad_norm": 1.7216160297393799, "learning_rate": 9.123808552933381e-05, "loss": 1.097, "step": 2120 }, { "epoch": 0.10614385614385614, "grad_norm": 1.6146548986434937, "learning_rate": 9.12126000305826e-05, "loss": 0.9365, "step": 2125 }, { "epoch": 0.1063936063936064, "grad_norm": 1.6868575811386108, "learning_rate": 9.11871145318314e-05, "loss": 1.0741, "step": 2130 }, { "epoch": 0.10664335664335664, "grad_norm": 1.4694464206695557, "learning_rate": 9.116162903308018e-05, "loss": 1.0066, "step": 2135 }, { "epoch": 0.1068931068931069, "grad_norm": 1.4769980907440186, "learning_rate": 9.113614353432897e-05, "loss": 1.1144, "step": 2140 }, { "epoch": 0.10714285714285714, "grad_norm": 1.7758394479751587, "learning_rate": 9.111065803557776e-05, "loss": 1.0958, "step": 2145 }, { "epoch": 0.10739260739260739, "grad_norm": 1.6722019910812378, "learning_rate": 9.108517253682654e-05, "loss": 1.1063, "step": 2150 }, { "epoch": 0.10764235764235765, "grad_norm": 2.069653034210205, "learning_rate": 9.105968703807534e-05, "loss": 1.0336, "step": 2155 }, { "epoch": 0.10789210789210789, "grad_norm": 1.3344229459762573, "learning_rate": 9.103420153932413e-05, "loss": 0.9972, "step": 2160 }, { "epoch": 0.10814185814185814, "grad_norm": 2.9084346294403076, "learning_rate": 9.100871604057291e-05, "loss": 0.9695, "step": 2165 }, { "epoch": 0.10839160839160839, "grad_norm": 1.7706812620162964, "learning_rate": 9.098323054182171e-05, "loss": 1.0787, "step": 2170 }, { "epoch": 0.10864135864135864, "grad_norm": 2.1064703464508057, "learning_rate": 9.09577450430705e-05, "loss": 1.0151, "step": 2175 }, { "epoch": 0.1088911088911089, "grad_norm": 1.4901058673858643, "learning_rate": 9.093225954431928e-05, "loss": 0.9226, "step": 2180 }, { "epoch": 0.10914085914085914, "grad_norm": 1.7164499759674072, "learning_rate": 9.090677404556808e-05, "loss": 1.0053, "step": 2185 }, { "epoch": 0.10939060939060939, "grad_norm": 1.573237419128418, "learning_rate": 9.088128854681686e-05, "loss": 1.1322, "step": 2190 }, { "epoch": 0.10964035964035965, "grad_norm": 1.654096245765686, "learning_rate": 9.085580304806566e-05, "loss": 1.132, "step": 2195 }, { "epoch": 0.10989010989010989, "grad_norm": 1.7190895080566406, "learning_rate": 9.083031754931445e-05, "loss": 0.9697, "step": 2200 }, { "epoch": 0.11013986013986014, "grad_norm": 2.212390422821045, "learning_rate": 9.080483205056323e-05, "loss": 1.1021, "step": 2205 }, { "epoch": 0.11038961038961038, "grad_norm": 1.8313217163085938, "learning_rate": 9.077934655181203e-05, "loss": 1.0916, "step": 2210 }, { "epoch": 0.11063936063936064, "grad_norm": 1.687462329864502, "learning_rate": 9.075386105306082e-05, "loss": 0.9823, "step": 2215 }, { "epoch": 0.1108891108891109, "grad_norm": 2.377135753631592, "learning_rate": 9.072837555430959e-05, "loss": 1.0509, "step": 2220 }, { "epoch": 0.11113886113886114, "grad_norm": 2.4240972995758057, "learning_rate": 9.070289005555839e-05, "loss": 1.0009, "step": 2225 }, { "epoch": 0.11138861138861139, "grad_norm": 1.7449638843536377, "learning_rate": 9.067740455680718e-05, "loss": 1.0782, "step": 2230 }, { "epoch": 0.11163836163836163, "grad_norm": 2.1716010570526123, "learning_rate": 9.065191905805597e-05, "loss": 1.0774, "step": 2235 }, { "epoch": 0.11188811188811189, "grad_norm": 1.5327948331832886, "learning_rate": 9.062643355930476e-05, "loss": 1.092, "step": 2240 }, { "epoch": 0.11213786213786214, "grad_norm": 2.833946943283081, "learning_rate": 9.060094806055354e-05, "loss": 1.0812, "step": 2245 }, { "epoch": 0.11238761238761238, "grad_norm": 2.0088608264923096, "learning_rate": 9.057546256180234e-05, "loss": 0.9516, "step": 2250 }, { "epoch": 0.11263736263736264, "grad_norm": 2.4475722312927246, "learning_rate": 9.054997706305113e-05, "loss": 0.9926, "step": 2255 }, { "epoch": 0.11288711288711288, "grad_norm": 3.076702833175659, "learning_rate": 9.052449156429991e-05, "loss": 1.1284, "step": 2260 }, { "epoch": 0.11313686313686314, "grad_norm": 2.1518921852111816, "learning_rate": 9.049900606554871e-05, "loss": 1.0359, "step": 2265 }, { "epoch": 0.11338661338661339, "grad_norm": 2.1394128799438477, "learning_rate": 9.04735205667975e-05, "loss": 1.0441, "step": 2270 }, { "epoch": 0.11363636363636363, "grad_norm": 1.8072049617767334, "learning_rate": 9.044803506804628e-05, "loss": 0.9534, "step": 2275 }, { "epoch": 0.11388611388611389, "grad_norm": 1.9882434606552124, "learning_rate": 9.042254956929508e-05, "loss": 1.0553, "step": 2280 }, { "epoch": 0.11413586413586413, "grad_norm": 1.7352203130722046, "learning_rate": 9.039706407054387e-05, "loss": 1.0134, "step": 2285 }, { "epoch": 0.11438561438561438, "grad_norm": 2.043804168701172, "learning_rate": 9.037157857179265e-05, "loss": 1.0349, "step": 2290 }, { "epoch": 0.11463536463536464, "grad_norm": 2.6210641860961914, "learning_rate": 9.034609307304145e-05, "loss": 1.0713, "step": 2295 }, { "epoch": 0.11488511488511488, "grad_norm": 2.2421951293945312, "learning_rate": 9.032060757429023e-05, "loss": 1.0259, "step": 2300 }, { "epoch": 0.11513486513486514, "grad_norm": 1.8432635068893433, "learning_rate": 9.029512207553902e-05, "loss": 1.0839, "step": 2305 }, { "epoch": 0.11538461538461539, "grad_norm": 1.5793393850326538, "learning_rate": 9.026963657678781e-05, "loss": 1.0082, "step": 2310 }, { "epoch": 0.11563436563436563, "grad_norm": 1.67103910446167, "learning_rate": 9.02441510780366e-05, "loss": 1.0225, "step": 2315 }, { "epoch": 0.11588411588411589, "grad_norm": 2.022448778152466, "learning_rate": 9.02186655792854e-05, "loss": 1.0477, "step": 2320 }, { "epoch": 0.11613386613386613, "grad_norm": 1.6987947225570679, "learning_rate": 9.019318008053418e-05, "loss": 0.8845, "step": 2325 }, { "epoch": 0.11638361638361638, "grad_norm": 2.0741217136383057, "learning_rate": 9.016769458178296e-05, "loss": 1.0881, "step": 2330 }, { "epoch": 0.11663336663336664, "grad_norm": 2.3251655101776123, "learning_rate": 9.014220908303176e-05, "loss": 1.0189, "step": 2335 }, { "epoch": 0.11688311688311688, "grad_norm": 1.7478495836257935, "learning_rate": 9.011672358428055e-05, "loss": 1.0308, "step": 2340 }, { "epoch": 0.11713286713286714, "grad_norm": 2.3027474880218506, "learning_rate": 9.009123808552933e-05, "loss": 1.0464, "step": 2345 }, { "epoch": 0.11738261738261738, "grad_norm": 2.1689608097076416, "learning_rate": 9.006575258677813e-05, "loss": 1.2171, "step": 2350 }, { "epoch": 0.11763236763236763, "grad_norm": 1.52338707447052, "learning_rate": 9.004026708802692e-05, "loss": 1.0851, "step": 2355 }, { "epoch": 0.11788211788211789, "grad_norm": 1.8425296545028687, "learning_rate": 9.00147815892757e-05, "loss": 1.0639, "step": 2360 }, { "epoch": 0.11813186813186813, "grad_norm": 2.3348381519317627, "learning_rate": 8.99892960905245e-05, "loss": 1.0297, "step": 2365 }, { "epoch": 0.11838161838161838, "grad_norm": 1.6651554107666016, "learning_rate": 8.996381059177329e-05, "loss": 1.1615, "step": 2370 }, { "epoch": 0.11863136863136862, "grad_norm": 1.8528300523757935, "learning_rate": 8.993832509302207e-05, "loss": 1.017, "step": 2375 }, { "epoch": 0.11888111888111888, "grad_norm": 1.7702510356903076, "learning_rate": 8.991283959427086e-05, "loss": 1.0625, "step": 2380 }, { "epoch": 0.11913086913086914, "grad_norm": 1.7936981916427612, "learning_rate": 8.988735409551964e-05, "loss": 1.0053, "step": 2385 }, { "epoch": 0.11938061938061938, "grad_norm": 1.768790364265442, "learning_rate": 8.986186859676844e-05, "loss": 1.013, "step": 2390 }, { "epoch": 0.11963036963036963, "grad_norm": 2.1831345558166504, "learning_rate": 8.983638309801723e-05, "loss": 1.0682, "step": 2395 }, { "epoch": 0.11988011988011989, "grad_norm": 2.0166027545928955, "learning_rate": 8.981089759926601e-05, "loss": 1.0797, "step": 2400 }, { "epoch": 0.12012987012987013, "grad_norm": 2.357877254486084, "learning_rate": 8.978541210051481e-05, "loss": 1.0431, "step": 2405 }, { "epoch": 0.12037962037962038, "grad_norm": 2.2804207801818848, "learning_rate": 8.97599266017636e-05, "loss": 1.0429, "step": 2410 }, { "epoch": 0.12062937062937062, "grad_norm": 1.9239094257354736, "learning_rate": 8.973444110301238e-05, "loss": 0.941, "step": 2415 }, { "epoch": 0.12087912087912088, "grad_norm": 2.4048569202423096, "learning_rate": 8.970895560426118e-05, "loss": 1.0453, "step": 2420 }, { "epoch": 0.12112887112887114, "grad_norm": 1.949598789215088, "learning_rate": 8.968347010550997e-05, "loss": 1.0354, "step": 2425 }, { "epoch": 0.12137862137862138, "grad_norm": 1.8157947063446045, "learning_rate": 8.965798460675875e-05, "loss": 1.1299, "step": 2430 }, { "epoch": 0.12162837162837163, "grad_norm": 1.656416893005371, "learning_rate": 8.963249910800755e-05, "loss": 1.1187, "step": 2435 }, { "epoch": 0.12187812187812187, "grad_norm": 2.1745493412017822, "learning_rate": 8.960701360925634e-05, "loss": 1.1126, "step": 2440 }, { "epoch": 0.12212787212787213, "grad_norm": 1.9854999780654907, "learning_rate": 8.958152811050514e-05, "loss": 1.1295, "step": 2445 }, { "epoch": 0.12237762237762238, "grad_norm": 1.7837671041488647, "learning_rate": 8.955604261175392e-05, "loss": 1.0599, "step": 2450 }, { "epoch": 0.12262737262737262, "grad_norm": 2.2991738319396973, "learning_rate": 8.953055711300271e-05, "loss": 1.0984, "step": 2455 }, { "epoch": 0.12287712287712288, "grad_norm": 2.5107836723327637, "learning_rate": 8.95050716142515e-05, "loss": 1.1367, "step": 2460 }, { "epoch": 0.12312687312687312, "grad_norm": 1.7681961059570312, "learning_rate": 8.947958611550028e-05, "loss": 1.0497, "step": 2465 }, { "epoch": 0.12337662337662338, "grad_norm": 2.529489278793335, "learning_rate": 8.945410061674906e-05, "loss": 1.0309, "step": 2470 }, { "epoch": 0.12362637362637363, "grad_norm": 2.0825414657592773, "learning_rate": 8.942861511799786e-05, "loss": 1.1992, "step": 2475 }, { "epoch": 0.12387612387612387, "grad_norm": 1.7368685007095337, "learning_rate": 8.940312961924665e-05, "loss": 1.064, "step": 2480 }, { "epoch": 0.12412587412587413, "grad_norm": 1.7694673538208008, "learning_rate": 8.937764412049543e-05, "loss": 1.0476, "step": 2485 }, { "epoch": 0.12437562437562437, "grad_norm": 1.785753846168518, "learning_rate": 8.935215862174423e-05, "loss": 1.0233, "step": 2490 }, { "epoch": 0.12462537462537462, "grad_norm": 1.7884680032730103, "learning_rate": 8.932667312299302e-05, "loss": 1.0426, "step": 2495 }, { "epoch": 0.12487512487512488, "grad_norm": 1.5876563787460327, "learning_rate": 8.930118762424182e-05, "loss": 1.0525, "step": 2500 }, { "epoch": 0.12512487512487513, "grad_norm": 1.277575135231018, "learning_rate": 8.92757021254906e-05, "loss": 1.0585, "step": 2505 }, { "epoch": 0.12537462537462538, "grad_norm": 1.6417629718780518, "learning_rate": 8.925021662673939e-05, "loss": 1.0592, "step": 2510 }, { "epoch": 0.12562437562437562, "grad_norm": 1.8105955123901367, "learning_rate": 8.922473112798819e-05, "loss": 0.9967, "step": 2515 }, { "epoch": 0.1258741258741259, "grad_norm": 1.8297070264816284, "learning_rate": 8.919924562923697e-05, "loss": 0.9621, "step": 2520 }, { "epoch": 0.12612387612387613, "grad_norm": 2.120720863342285, "learning_rate": 8.917376013048576e-05, "loss": 0.9339, "step": 2525 }, { "epoch": 0.12637362637362637, "grad_norm": 2.2692010402679443, "learning_rate": 8.914827463173456e-05, "loss": 0.9722, "step": 2530 }, { "epoch": 0.1266233766233766, "grad_norm": 4.11220121383667, "learning_rate": 8.912278913298334e-05, "loss": 0.9596, "step": 2535 }, { "epoch": 0.12687312687312688, "grad_norm": 1.6969612836837769, "learning_rate": 8.909730363423211e-05, "loss": 1.1729, "step": 2540 }, { "epoch": 0.12712287712287712, "grad_norm": 2.033691883087158, "learning_rate": 8.907181813548091e-05, "loss": 1.074, "step": 2545 }, { "epoch": 0.12737262737262736, "grad_norm": 2.3797221183776855, "learning_rate": 8.90463326367297e-05, "loss": 0.9465, "step": 2550 }, { "epoch": 0.12762237762237763, "grad_norm": 2.2487850189208984, "learning_rate": 8.90208471379785e-05, "loss": 1.1333, "step": 2555 }, { "epoch": 0.12787212787212787, "grad_norm": 2.4816606044769287, "learning_rate": 8.899536163922728e-05, "loss": 1.0592, "step": 2560 }, { "epoch": 0.1281218781218781, "grad_norm": 2.238875389099121, "learning_rate": 8.896987614047607e-05, "loss": 1.1156, "step": 2565 }, { "epoch": 0.12837162837162838, "grad_norm": 1.8280402421951294, "learning_rate": 8.894439064172487e-05, "loss": 1.0287, "step": 2570 }, { "epoch": 0.12862137862137862, "grad_norm": 2.3221933841705322, "learning_rate": 8.891890514297365e-05, "loss": 1.0457, "step": 2575 }, { "epoch": 0.12887112887112886, "grad_norm": 1.958493947982788, "learning_rate": 8.889341964422244e-05, "loss": 1.1111, "step": 2580 }, { "epoch": 0.12912087912087913, "grad_norm": 2.529475688934326, "learning_rate": 8.886793414547124e-05, "loss": 1.0493, "step": 2585 }, { "epoch": 0.12937062937062938, "grad_norm": 2.1219065189361572, "learning_rate": 8.884244864672002e-05, "loss": 1.0908, "step": 2590 }, { "epoch": 0.12962037962037962, "grad_norm": 1.593130350112915, "learning_rate": 8.881696314796881e-05, "loss": 1.1234, "step": 2595 }, { "epoch": 0.12987012987012986, "grad_norm": 2.014354705810547, "learning_rate": 8.879147764921761e-05, "loss": 1.1519, "step": 2600 }, { "epoch": 0.13011988011988013, "grad_norm": 2.212662696838379, "learning_rate": 8.876599215046639e-05, "loss": 1.1193, "step": 2605 }, { "epoch": 0.13036963036963037, "grad_norm": 1.6654571294784546, "learning_rate": 8.874050665171518e-05, "loss": 1.1415, "step": 2610 }, { "epoch": 0.1306193806193806, "grad_norm": 1.9637036323547363, "learning_rate": 8.871502115296398e-05, "loss": 1.0034, "step": 2615 }, { "epoch": 0.13086913086913088, "grad_norm": 2.221522092819214, "learning_rate": 8.868953565421275e-05, "loss": 1.1609, "step": 2620 }, { "epoch": 0.13111888111888112, "grad_norm": 2.7778451442718506, "learning_rate": 8.866405015546155e-05, "loss": 1.1316, "step": 2625 }, { "epoch": 0.13136863136863136, "grad_norm": 1.4244577884674072, "learning_rate": 8.863856465671033e-05, "loss": 0.952, "step": 2630 }, { "epoch": 0.13161838161838163, "grad_norm": 2.081162452697754, "learning_rate": 8.861307915795912e-05, "loss": 1.0771, "step": 2635 }, { "epoch": 0.13186813186813187, "grad_norm": 1.7517297267913818, "learning_rate": 8.858759365920792e-05, "loss": 1.1374, "step": 2640 }, { "epoch": 0.1321178821178821, "grad_norm": 2.35196590423584, "learning_rate": 8.85621081604567e-05, "loss": 1.1061, "step": 2645 }, { "epoch": 0.13236763236763235, "grad_norm": 1.7148562669754028, "learning_rate": 8.853662266170549e-05, "loss": 1.0081, "step": 2650 }, { "epoch": 0.13261738261738262, "grad_norm": 3.011352300643921, "learning_rate": 8.851113716295429e-05, "loss": 1.0137, "step": 2655 }, { "epoch": 0.13286713286713286, "grad_norm": 1.5164105892181396, "learning_rate": 8.848565166420307e-05, "loss": 1.1004, "step": 2660 }, { "epoch": 0.1331168831168831, "grad_norm": 1.618575930595398, "learning_rate": 8.846016616545186e-05, "loss": 1.1281, "step": 2665 }, { "epoch": 0.13336663336663337, "grad_norm": 3.256931781768799, "learning_rate": 8.843468066670066e-05, "loss": 1.0754, "step": 2670 }, { "epoch": 0.13361638361638362, "grad_norm": 2.149198532104492, "learning_rate": 8.840919516794944e-05, "loss": 1.0766, "step": 2675 }, { "epoch": 0.13386613386613386, "grad_norm": 1.5928980112075806, "learning_rate": 8.838370966919823e-05, "loss": 1.1058, "step": 2680 }, { "epoch": 0.13411588411588413, "grad_norm": 1.8001689910888672, "learning_rate": 8.835822417044703e-05, "loss": 1.046, "step": 2685 }, { "epoch": 0.13436563436563437, "grad_norm": 1.9299262762069702, "learning_rate": 8.833273867169581e-05, "loss": 1.0363, "step": 2690 }, { "epoch": 0.1346153846153846, "grad_norm": 1.8792645931243896, "learning_rate": 8.83072531729446e-05, "loss": 1.0041, "step": 2695 }, { "epoch": 0.13486513486513488, "grad_norm": 2.6870203018188477, "learning_rate": 8.828176767419338e-05, "loss": 1.0356, "step": 2700 }, { "epoch": 0.13511488511488512, "grad_norm": 2.374284267425537, "learning_rate": 8.825628217544217e-05, "loss": 0.9648, "step": 2705 }, { "epoch": 0.13536463536463536, "grad_norm": 2.517435312271118, "learning_rate": 8.823079667669097e-05, "loss": 1.1937, "step": 2710 }, { "epoch": 0.1356143856143856, "grad_norm": 2.000870704650879, "learning_rate": 8.820531117793975e-05, "loss": 0.9251, "step": 2715 }, { "epoch": 0.13586413586413587, "grad_norm": 1.9407223463058472, "learning_rate": 8.817982567918854e-05, "loss": 1.0735, "step": 2720 }, { "epoch": 0.1361138861138861, "grad_norm": 2.781930446624756, "learning_rate": 8.815434018043734e-05, "loss": 1.1064, "step": 2725 }, { "epoch": 0.13636363636363635, "grad_norm": 2.186769723892212, "learning_rate": 8.812885468168612e-05, "loss": 1.1024, "step": 2730 }, { "epoch": 0.13661338661338662, "grad_norm": 1.7790948152542114, "learning_rate": 8.810336918293491e-05, "loss": 1.1093, "step": 2735 }, { "epoch": 0.13686313686313686, "grad_norm": 1.9105846881866455, "learning_rate": 8.80778836841837e-05, "loss": 0.9497, "step": 2740 }, { "epoch": 0.1371128871128871, "grad_norm": 1.7560501098632812, "learning_rate": 8.805239818543249e-05, "loss": 1.0822, "step": 2745 }, { "epoch": 0.13736263736263737, "grad_norm": 1.917834758758545, "learning_rate": 8.802691268668129e-05, "loss": 1.0975, "step": 2750 }, { "epoch": 0.13761238761238762, "grad_norm": 1.7206779718399048, "learning_rate": 8.800142718793008e-05, "loss": 1.1492, "step": 2755 }, { "epoch": 0.13786213786213786, "grad_norm": 1.899228811264038, "learning_rate": 8.797594168917886e-05, "loss": 1.0073, "step": 2760 }, { "epoch": 0.1381118881118881, "grad_norm": 1.9342011213302612, "learning_rate": 8.795045619042766e-05, "loss": 1.0585, "step": 2765 }, { "epoch": 0.13836163836163837, "grad_norm": 1.8506425619125366, "learning_rate": 8.792497069167645e-05, "loss": 1.1016, "step": 2770 }, { "epoch": 0.1386113886113886, "grad_norm": 1.8252501487731934, "learning_rate": 8.789948519292523e-05, "loss": 1.0115, "step": 2775 }, { "epoch": 0.13886113886113885, "grad_norm": 1.6893677711486816, "learning_rate": 8.787399969417402e-05, "loss": 1.1199, "step": 2780 }, { "epoch": 0.13911088911088912, "grad_norm": 1.6307785511016846, "learning_rate": 8.78485141954228e-05, "loss": 1.0743, "step": 2785 }, { "epoch": 0.13936063936063936, "grad_norm": 2.1995651721954346, "learning_rate": 8.782302869667159e-05, "loss": 1.0805, "step": 2790 }, { "epoch": 0.1396103896103896, "grad_norm": 1.708590030670166, "learning_rate": 8.779754319792039e-05, "loss": 1.1809, "step": 2795 }, { "epoch": 0.13986013986013987, "grad_norm": 1.6225452423095703, "learning_rate": 8.777205769916917e-05, "loss": 1.1492, "step": 2800 }, { "epoch": 0.1401098901098901, "grad_norm": 2.3506221771240234, "learning_rate": 8.774657220041797e-05, "loss": 1.0628, "step": 2805 }, { "epoch": 0.14035964035964035, "grad_norm": 1.8225237131118774, "learning_rate": 8.772108670166676e-05, "loss": 1.0145, "step": 2810 }, { "epoch": 0.14060939060939062, "grad_norm": 2.8418402671813965, "learning_rate": 8.769560120291554e-05, "loss": 0.9447, "step": 2815 }, { "epoch": 0.14085914085914086, "grad_norm": 2.1578736305236816, "learning_rate": 8.767011570416434e-05, "loss": 0.9401, "step": 2820 }, { "epoch": 0.1411088911088911, "grad_norm": 1.7758172750473022, "learning_rate": 8.764463020541313e-05, "loss": 1.0523, "step": 2825 }, { "epoch": 0.14135864135864135, "grad_norm": 2.8058080673217773, "learning_rate": 8.761914470666191e-05, "loss": 1.0037, "step": 2830 }, { "epoch": 0.14160839160839161, "grad_norm": 1.9823451042175293, "learning_rate": 8.759365920791071e-05, "loss": 1.0849, "step": 2835 }, { "epoch": 0.14185814185814186, "grad_norm": 3.396188497543335, "learning_rate": 8.75681737091595e-05, "loss": 1.0068, "step": 2840 }, { "epoch": 0.1421078921078921, "grad_norm": 2.2023849487304688, "learning_rate": 8.754268821040828e-05, "loss": 1.0014, "step": 2845 }, { "epoch": 0.14235764235764237, "grad_norm": 2.133798599243164, "learning_rate": 8.751720271165708e-05, "loss": 0.9958, "step": 2850 }, { "epoch": 0.1426073926073926, "grad_norm": 2.006645441055298, "learning_rate": 8.749171721290587e-05, "loss": 1.0519, "step": 2855 }, { "epoch": 0.14285714285714285, "grad_norm": 2.184204339981079, "learning_rate": 8.746623171415465e-05, "loss": 0.9837, "step": 2860 }, { "epoch": 0.14310689310689312, "grad_norm": 2.2189362049102783, "learning_rate": 8.744074621540344e-05, "loss": 1.0764, "step": 2865 }, { "epoch": 0.14335664335664336, "grad_norm": 2.521294593811035, "learning_rate": 8.741526071665222e-05, "loss": 1.1065, "step": 2870 }, { "epoch": 0.1436063936063936, "grad_norm": 2.337636709213257, "learning_rate": 8.738977521790102e-05, "loss": 0.9862, "step": 2875 }, { "epoch": 0.14385614385614387, "grad_norm": 2.058603286743164, "learning_rate": 8.73642897191498e-05, "loss": 1.1563, "step": 2880 }, { "epoch": 0.1441058941058941, "grad_norm": 2.5505788326263428, "learning_rate": 8.733880422039859e-05, "loss": 1.1343, "step": 2885 }, { "epoch": 0.14435564435564435, "grad_norm": 1.8435453176498413, "learning_rate": 8.731331872164739e-05, "loss": 1.0244, "step": 2890 }, { "epoch": 0.1446053946053946, "grad_norm": 1.9638776779174805, "learning_rate": 8.728783322289618e-05, "loss": 0.9861, "step": 2895 }, { "epoch": 0.14485514485514486, "grad_norm": 1.8834844827651978, "learning_rate": 8.726234772414496e-05, "loss": 1.1365, "step": 2900 }, { "epoch": 0.1451048951048951, "grad_norm": 2.475665330886841, "learning_rate": 8.723686222539376e-05, "loss": 1.0316, "step": 2905 }, { "epoch": 0.14535464535464535, "grad_norm": 1.7450743913650513, "learning_rate": 8.721137672664255e-05, "loss": 1.076, "step": 2910 }, { "epoch": 0.14560439560439561, "grad_norm": 1.9902225732803345, "learning_rate": 8.718589122789133e-05, "loss": 1.0939, "step": 2915 }, { "epoch": 0.14585414585414586, "grad_norm": 1.836561918258667, "learning_rate": 8.716040572914013e-05, "loss": 1.0681, "step": 2920 }, { "epoch": 0.1461038961038961, "grad_norm": 2.0990192890167236, "learning_rate": 8.713492023038892e-05, "loss": 0.9637, "step": 2925 }, { "epoch": 0.14635364635364637, "grad_norm": 1.6745598316192627, "learning_rate": 8.71094347316377e-05, "loss": 1.1055, "step": 2930 }, { "epoch": 0.1466033966033966, "grad_norm": 1.8660370111465454, "learning_rate": 8.70839492328865e-05, "loss": 1.0482, "step": 2935 }, { "epoch": 0.14685314685314685, "grad_norm": 2.8354482650756836, "learning_rate": 8.705846373413527e-05, "loss": 1.108, "step": 2940 }, { "epoch": 0.1471028971028971, "grad_norm": 2.0993285179138184, "learning_rate": 8.703297823538407e-05, "loss": 1.0826, "step": 2945 }, { "epoch": 0.14735264735264736, "grad_norm": 1.856261968612671, "learning_rate": 8.700749273663286e-05, "loss": 1.0124, "step": 2950 }, { "epoch": 0.1476023976023976, "grad_norm": 1.986202359199524, "learning_rate": 8.698200723788164e-05, "loss": 1.144, "step": 2955 }, { "epoch": 0.14785214785214784, "grad_norm": 2.092133045196533, "learning_rate": 8.695652173913044e-05, "loss": 1.0298, "step": 2960 }, { "epoch": 0.1481018981018981, "grad_norm": 2.0687754154205322, "learning_rate": 8.693103624037923e-05, "loss": 1.0596, "step": 2965 }, { "epoch": 0.14835164835164835, "grad_norm": 1.8726460933685303, "learning_rate": 8.690555074162801e-05, "loss": 1.127, "step": 2970 }, { "epoch": 0.1486013986013986, "grad_norm": 1.8351082801818848, "learning_rate": 8.688006524287681e-05, "loss": 1.1151, "step": 2975 }, { "epoch": 0.14885114885114886, "grad_norm": 1.6287287473678589, "learning_rate": 8.68545797441256e-05, "loss": 1.0895, "step": 2980 }, { "epoch": 0.1491008991008991, "grad_norm": 2.4128530025482178, "learning_rate": 8.682909424537438e-05, "loss": 1.051, "step": 2985 }, { "epoch": 0.14935064935064934, "grad_norm": 1.812265157699585, "learning_rate": 8.680360874662318e-05, "loss": 1.0419, "step": 2990 }, { "epoch": 0.14960039960039961, "grad_norm": 1.9314815998077393, "learning_rate": 8.677812324787197e-05, "loss": 1.1499, "step": 2995 }, { "epoch": 0.14985014985014986, "grad_norm": 1.964510440826416, "learning_rate": 8.675263774912075e-05, "loss": 1.0843, "step": 3000 }, { "epoch": 0.1500999000999001, "grad_norm": 1.6745145320892334, "learning_rate": 8.672715225036955e-05, "loss": 0.9663, "step": 3005 }, { "epoch": 0.15034965034965034, "grad_norm": 1.9032002687454224, "learning_rate": 8.670166675161834e-05, "loss": 1.0104, "step": 3010 }, { "epoch": 0.1505994005994006, "grad_norm": 1.8911412954330444, "learning_rate": 8.667618125286713e-05, "loss": 1.0728, "step": 3015 }, { "epoch": 0.15084915084915085, "grad_norm": 1.8938888311386108, "learning_rate": 8.66506957541159e-05, "loss": 1.112, "step": 3020 }, { "epoch": 0.1510989010989011, "grad_norm": 2.093853712081909, "learning_rate": 8.662521025536469e-05, "loss": 1.0787, "step": 3025 }, { "epoch": 0.15134865134865136, "grad_norm": 3.0604894161224365, "learning_rate": 8.659972475661349e-05, "loss": 1.1017, "step": 3030 }, { "epoch": 0.1515984015984016, "grad_norm": 1.8906617164611816, "learning_rate": 8.657423925786228e-05, "loss": 1.0234, "step": 3035 }, { "epoch": 0.15184815184815184, "grad_norm": 1.7201045751571655, "learning_rate": 8.654875375911106e-05, "loss": 0.9881, "step": 3040 }, { "epoch": 0.1520979020979021, "grad_norm": 1.6658756732940674, "learning_rate": 8.652326826035986e-05, "loss": 1.0421, "step": 3045 }, { "epoch": 0.15234765234765235, "grad_norm": 1.5572222471237183, "learning_rate": 8.649778276160865e-05, "loss": 1.1287, "step": 3050 }, { "epoch": 0.1525974025974026, "grad_norm": 2.1064367294311523, "learning_rate": 8.647229726285743e-05, "loss": 1.1015, "step": 3055 }, { "epoch": 0.15284715284715283, "grad_norm": 2.166884183883667, "learning_rate": 8.644681176410623e-05, "loss": 1.0627, "step": 3060 }, { "epoch": 0.1530969030969031, "grad_norm": 2.7731642723083496, "learning_rate": 8.642132626535502e-05, "loss": 1.117, "step": 3065 }, { "epoch": 0.15334665334665334, "grad_norm": 1.7246001958847046, "learning_rate": 8.639584076660381e-05, "loss": 1.0576, "step": 3070 }, { "epoch": 0.15359640359640359, "grad_norm": 2.19831919670105, "learning_rate": 8.63703552678526e-05, "loss": 1.0006, "step": 3075 }, { "epoch": 0.15384615384615385, "grad_norm": 1.8473360538482666, "learning_rate": 8.634486976910139e-05, "loss": 1.051, "step": 3080 }, { "epoch": 0.1540959040959041, "grad_norm": 1.9066402912139893, "learning_rate": 8.631938427035018e-05, "loss": 1.0656, "step": 3085 }, { "epoch": 0.15434565434565434, "grad_norm": 1.6816966533660889, "learning_rate": 8.629389877159897e-05, "loss": 1.0608, "step": 3090 }, { "epoch": 0.1545954045954046, "grad_norm": 2.2561936378479004, "learning_rate": 8.626841327284776e-05, "loss": 0.964, "step": 3095 }, { "epoch": 0.15484515484515485, "grad_norm": 2.5808048248291016, "learning_rate": 8.624292777409654e-05, "loss": 1.0112, "step": 3100 }, { "epoch": 0.1550949050949051, "grad_norm": 1.8227851390838623, "learning_rate": 8.621744227534533e-05, "loss": 0.9916, "step": 3105 }, { "epoch": 0.15534465534465536, "grad_norm": 1.7341548204421997, "learning_rate": 8.619195677659412e-05, "loss": 1.1069, "step": 3110 }, { "epoch": 0.1555944055944056, "grad_norm": 1.7087905406951904, "learning_rate": 8.616647127784291e-05, "loss": 1.0318, "step": 3115 }, { "epoch": 0.15584415584415584, "grad_norm": 1.7445971965789795, "learning_rate": 8.61409857790917e-05, "loss": 1.0484, "step": 3120 }, { "epoch": 0.15609390609390608, "grad_norm": 2.5312249660491943, "learning_rate": 8.61155002803405e-05, "loss": 1.0662, "step": 3125 }, { "epoch": 0.15634365634365635, "grad_norm": 1.638973355293274, "learning_rate": 8.609001478158928e-05, "loss": 1.0893, "step": 3130 }, { "epoch": 0.1565934065934066, "grad_norm": 2.396437883377075, "learning_rate": 8.606452928283807e-05, "loss": 1.0887, "step": 3135 }, { "epoch": 0.15684315684315683, "grad_norm": 2.3041741847991943, "learning_rate": 8.603904378408686e-05, "loss": 1.0811, "step": 3140 }, { "epoch": 0.1570929070929071, "grad_norm": 2.0702903270721436, "learning_rate": 8.601355828533565e-05, "loss": 1.1517, "step": 3145 }, { "epoch": 0.15734265734265734, "grad_norm": 2.859161376953125, "learning_rate": 8.598807278658444e-05, "loss": 1.0912, "step": 3150 }, { "epoch": 0.15759240759240759, "grad_norm": 2.1755731105804443, "learning_rate": 8.596258728783323e-05, "loss": 1.0059, "step": 3155 }, { "epoch": 0.15784215784215785, "grad_norm": 1.510913610458374, "learning_rate": 8.593710178908202e-05, "loss": 1.1188, "step": 3160 }, { "epoch": 0.1580919080919081, "grad_norm": 2.751120090484619, "learning_rate": 8.59116162903308e-05, "loss": 1.1801, "step": 3165 }, { "epoch": 0.15834165834165834, "grad_norm": 1.8940223455429077, "learning_rate": 8.58861307915796e-05, "loss": 1.0803, "step": 3170 }, { "epoch": 0.15859140859140858, "grad_norm": 1.7625248432159424, "learning_rate": 8.586064529282839e-05, "loss": 1.0491, "step": 3175 }, { "epoch": 0.15884115884115885, "grad_norm": 1.7603989839553833, "learning_rate": 8.583515979407717e-05, "loss": 1.1141, "step": 3180 }, { "epoch": 0.1590909090909091, "grad_norm": 2.1858510971069336, "learning_rate": 8.580967429532596e-05, "loss": 1.0791, "step": 3185 }, { "epoch": 0.15934065934065933, "grad_norm": 2.109264850616455, "learning_rate": 8.578418879657475e-05, "loss": 1.1461, "step": 3190 }, { "epoch": 0.1595904095904096, "grad_norm": 1.8282490968704224, "learning_rate": 8.575870329782354e-05, "loss": 1.0144, "step": 3195 }, { "epoch": 0.15984015984015984, "grad_norm": 1.7804405689239502, "learning_rate": 8.573321779907233e-05, "loss": 1.1303, "step": 3200 }, { "epoch": 0.16008991008991008, "grad_norm": 1.83059823513031, "learning_rate": 8.570773230032112e-05, "loss": 1.1063, "step": 3205 }, { "epoch": 0.16033966033966035, "grad_norm": 2.1611216068267822, "learning_rate": 8.568224680156991e-05, "loss": 1.1059, "step": 3210 }, { "epoch": 0.1605894105894106, "grad_norm": 1.923872947692871, "learning_rate": 8.56567613028187e-05, "loss": 1.0541, "step": 3215 }, { "epoch": 0.16083916083916083, "grad_norm": 1.9147241115570068, "learning_rate": 8.563127580406749e-05, "loss": 0.9315, "step": 3220 }, { "epoch": 0.1610889110889111, "grad_norm": 1.5878801345825195, "learning_rate": 8.560579030531628e-05, "loss": 1.0929, "step": 3225 }, { "epoch": 0.16133866133866134, "grad_norm": 2.5649254322052, "learning_rate": 8.558030480656507e-05, "loss": 0.9865, "step": 3230 }, { "epoch": 0.16158841158841158, "grad_norm": 2.0872910022735596, "learning_rate": 8.555481930781385e-05, "loss": 1.1166, "step": 3235 }, { "epoch": 0.16183816183816183, "grad_norm": 2.710996389389038, "learning_rate": 8.552933380906265e-05, "loss": 1.0471, "step": 3240 }, { "epoch": 0.1620879120879121, "grad_norm": 2.005072832107544, "learning_rate": 8.550384831031144e-05, "loss": 1.0343, "step": 3245 }, { "epoch": 0.16233766233766234, "grad_norm": 1.6735420227050781, "learning_rate": 8.547836281156022e-05, "loss": 0.9592, "step": 3250 }, { "epoch": 0.16258741258741258, "grad_norm": 2.3414783477783203, "learning_rate": 8.545287731280902e-05, "loss": 1.0646, "step": 3255 }, { "epoch": 0.16283716283716285, "grad_norm": 2.2394216060638428, "learning_rate": 8.54273918140578e-05, "loss": 1.0906, "step": 3260 }, { "epoch": 0.1630869130869131, "grad_norm": 1.691817283630371, "learning_rate": 8.54019063153066e-05, "loss": 1.0928, "step": 3265 }, { "epoch": 0.16333666333666333, "grad_norm": 1.6381685733795166, "learning_rate": 8.537642081655538e-05, "loss": 0.9607, "step": 3270 }, { "epoch": 0.1635864135864136, "grad_norm": 1.7203367948532104, "learning_rate": 8.535093531780417e-05, "loss": 1.0341, "step": 3275 }, { "epoch": 0.16383616383616384, "grad_norm": 1.761385202407837, "learning_rate": 8.532544981905296e-05, "loss": 1.0372, "step": 3280 }, { "epoch": 0.16408591408591408, "grad_norm": 1.6859697103500366, "learning_rate": 8.529996432030175e-05, "loss": 1.1296, "step": 3285 }, { "epoch": 0.16433566433566432, "grad_norm": 1.8559995889663696, "learning_rate": 8.527447882155053e-05, "loss": 1.0507, "step": 3290 }, { "epoch": 0.1645854145854146, "grad_norm": 2.7342402935028076, "learning_rate": 8.524899332279933e-05, "loss": 1.0227, "step": 3295 }, { "epoch": 0.16483516483516483, "grad_norm": 1.946282148361206, "learning_rate": 8.522350782404812e-05, "loss": 1.0352, "step": 3300 }, { "epoch": 0.16508491508491507, "grad_norm": 1.691779613494873, "learning_rate": 8.51980223252969e-05, "loss": 1.0534, "step": 3305 }, { "epoch": 0.16533466533466534, "grad_norm": 1.9294683933258057, "learning_rate": 8.51725368265457e-05, "loss": 1.0713, "step": 3310 }, { "epoch": 0.16558441558441558, "grad_norm": 1.865203857421875, "learning_rate": 8.514705132779449e-05, "loss": 1.0417, "step": 3315 }, { "epoch": 0.16583416583416583, "grad_norm": 1.716173529624939, "learning_rate": 8.512156582904329e-05, "loss": 1.1909, "step": 3320 }, { "epoch": 0.1660839160839161, "grad_norm": 1.9712934494018555, "learning_rate": 8.509608033029207e-05, "loss": 0.9583, "step": 3325 }, { "epoch": 0.16633366633366634, "grad_norm": 1.9177230596542358, "learning_rate": 8.507059483154086e-05, "loss": 1.1049, "step": 3330 }, { "epoch": 0.16658341658341658, "grad_norm": 1.7774606943130493, "learning_rate": 8.504510933278966e-05, "loss": 0.949, "step": 3335 }, { "epoch": 0.16683316683316685, "grad_norm": 1.610927700996399, "learning_rate": 8.501962383403843e-05, "loss": 1.1768, "step": 3340 }, { "epoch": 0.1670829170829171, "grad_norm": 1.4962278604507446, "learning_rate": 8.499413833528722e-05, "loss": 1.0231, "step": 3345 }, { "epoch": 0.16733266733266733, "grad_norm": 2.2448313236236572, "learning_rate": 8.496865283653601e-05, "loss": 1.0812, "step": 3350 }, { "epoch": 0.16758241758241757, "grad_norm": 1.8053752183914185, "learning_rate": 8.49431673377848e-05, "loss": 1.0818, "step": 3355 }, { "epoch": 0.16783216783216784, "grad_norm": 1.634582757949829, "learning_rate": 8.491768183903358e-05, "loss": 1.1459, "step": 3360 }, { "epoch": 0.16808191808191808, "grad_norm": 2.007207155227661, "learning_rate": 8.489219634028238e-05, "loss": 1.0501, "step": 3365 }, { "epoch": 0.16833166833166832, "grad_norm": 1.7548673152923584, "learning_rate": 8.486671084153117e-05, "loss": 1.0245, "step": 3370 }, { "epoch": 0.1685814185814186, "grad_norm": 1.8318586349487305, "learning_rate": 8.484122534277997e-05, "loss": 1.1044, "step": 3375 }, { "epoch": 0.16883116883116883, "grad_norm": 1.8122289180755615, "learning_rate": 8.481573984402875e-05, "loss": 1.1295, "step": 3380 }, { "epoch": 0.16908091908091907, "grad_norm": 1.8305327892303467, "learning_rate": 8.479025434527754e-05, "loss": 1.0808, "step": 3385 }, { "epoch": 0.16933066933066934, "grad_norm": 1.8058063983917236, "learning_rate": 8.476476884652634e-05, "loss": 0.9854, "step": 3390 }, { "epoch": 0.16958041958041958, "grad_norm": 1.9131430387496948, "learning_rate": 8.473928334777512e-05, "loss": 0.9668, "step": 3395 }, { "epoch": 0.16983016983016982, "grad_norm": 2.5325429439544678, "learning_rate": 8.471379784902391e-05, "loss": 0.9934, "step": 3400 }, { "epoch": 0.17007992007992007, "grad_norm": 2.0652947425842285, "learning_rate": 8.468831235027271e-05, "loss": 1.0939, "step": 3405 }, { "epoch": 0.17032967032967034, "grad_norm": 2.1790881156921387, "learning_rate": 8.466282685152149e-05, "loss": 0.9895, "step": 3410 }, { "epoch": 0.17057942057942058, "grad_norm": 1.9851411581039429, "learning_rate": 8.463734135277028e-05, "loss": 1.0233, "step": 3415 }, { "epoch": 0.17082917082917082, "grad_norm": 2.940673828125, "learning_rate": 8.461185585401906e-05, "loss": 1.0368, "step": 3420 }, { "epoch": 0.1710789210789211, "grad_norm": 1.9723538160324097, "learning_rate": 8.458637035526785e-05, "loss": 1.1108, "step": 3425 }, { "epoch": 0.17132867132867133, "grad_norm": 1.5980992317199707, "learning_rate": 8.456088485651665e-05, "loss": 1.1031, "step": 3430 }, { "epoch": 0.17157842157842157, "grad_norm": 1.9177820682525635, "learning_rate": 8.453539935776543e-05, "loss": 1.0212, "step": 3435 }, { "epoch": 0.17182817182817184, "grad_norm": 1.5302908420562744, "learning_rate": 8.450991385901422e-05, "loss": 1.0591, "step": 3440 }, { "epoch": 0.17207792207792208, "grad_norm": 2.517000913619995, "learning_rate": 8.448442836026302e-05, "loss": 1.0059, "step": 3445 }, { "epoch": 0.17232767232767232, "grad_norm": 1.826887607574463, "learning_rate": 8.44589428615118e-05, "loss": 1.2085, "step": 3450 }, { "epoch": 0.1725774225774226, "grad_norm": 1.7652828693389893, "learning_rate": 8.443345736276059e-05, "loss": 1.178, "step": 3455 }, { "epoch": 0.17282717282717283, "grad_norm": 2.174689292907715, "learning_rate": 8.440797186400939e-05, "loss": 1.0943, "step": 3460 }, { "epoch": 0.17307692307692307, "grad_norm": 1.9129114151000977, "learning_rate": 8.438248636525817e-05, "loss": 1.0339, "step": 3465 }, { "epoch": 0.17332667332667331, "grad_norm": 1.7708076238632202, "learning_rate": 8.435700086650696e-05, "loss": 0.9905, "step": 3470 }, { "epoch": 0.17357642357642358, "grad_norm": 1.674776554107666, "learning_rate": 8.433151536775576e-05, "loss": 1.0487, "step": 3475 }, { "epoch": 0.17382617382617382, "grad_norm": 1.7026817798614502, "learning_rate": 8.430602986900454e-05, "loss": 1.0355, "step": 3480 }, { "epoch": 0.17407592407592407, "grad_norm": 1.7904154062271118, "learning_rate": 8.428054437025333e-05, "loss": 1.0576, "step": 3485 }, { "epoch": 0.17432567432567433, "grad_norm": 1.4526289701461792, "learning_rate": 8.425505887150213e-05, "loss": 1.1072, "step": 3490 }, { "epoch": 0.17457542457542458, "grad_norm": 2.495004177093506, "learning_rate": 8.422957337275091e-05, "loss": 0.9544, "step": 3495 }, { "epoch": 0.17482517482517482, "grad_norm": 1.900423288345337, "learning_rate": 8.42040878739997e-05, "loss": 1.0771, "step": 3500 }, { "epoch": 0.1750749250749251, "grad_norm": 2.4944396018981934, "learning_rate": 8.417860237524848e-05, "loss": 1.0261, "step": 3505 }, { "epoch": 0.17532467532467533, "grad_norm": 2.2146127223968506, "learning_rate": 8.415311687649727e-05, "loss": 1.0667, "step": 3510 }, { "epoch": 0.17557442557442557, "grad_norm": 1.4351907968521118, "learning_rate": 8.412763137774607e-05, "loss": 0.9261, "step": 3515 }, { "epoch": 0.17582417582417584, "grad_norm": 1.7383787631988525, "learning_rate": 8.410214587899485e-05, "loss": 0.9325, "step": 3520 }, { "epoch": 0.17607392607392608, "grad_norm": 1.5712844133377075, "learning_rate": 8.407666038024364e-05, "loss": 1.1246, "step": 3525 }, { "epoch": 0.17632367632367632, "grad_norm": 1.989151954650879, "learning_rate": 8.405117488149244e-05, "loss": 1.1469, "step": 3530 }, { "epoch": 0.17657342657342656, "grad_norm": 1.8924086093902588, "learning_rate": 8.402568938274122e-05, "loss": 1.036, "step": 3535 }, { "epoch": 0.17682317682317683, "grad_norm": 2.1909124851226807, "learning_rate": 8.400020388399001e-05, "loss": 1.0093, "step": 3540 }, { "epoch": 0.17707292707292707, "grad_norm": 1.572777509689331, "learning_rate": 8.397471838523881e-05, "loss": 0.9813, "step": 3545 }, { "epoch": 0.1773226773226773, "grad_norm": 2.3108177185058594, "learning_rate": 8.394923288648759e-05, "loss": 1.0031, "step": 3550 }, { "epoch": 0.17757242757242758, "grad_norm": 1.7949233055114746, "learning_rate": 8.392374738773638e-05, "loss": 0.9928, "step": 3555 }, { "epoch": 0.17782217782217782, "grad_norm": 2.3124494552612305, "learning_rate": 8.389826188898518e-05, "loss": 1.0589, "step": 3560 }, { "epoch": 0.17807192807192807, "grad_norm": 1.6609785556793213, "learning_rate": 8.387277639023396e-05, "loss": 1.0988, "step": 3565 }, { "epoch": 0.17832167832167833, "grad_norm": 2.0280025005340576, "learning_rate": 8.384729089148275e-05, "loss": 0.9165, "step": 3570 }, { "epoch": 0.17857142857142858, "grad_norm": 2.3510186672210693, "learning_rate": 8.382180539273155e-05, "loss": 0.9395, "step": 3575 }, { "epoch": 0.17882117882117882, "grad_norm": 1.783249020576477, "learning_rate": 8.379631989398032e-05, "loss": 0.9629, "step": 3580 }, { "epoch": 0.17907092907092906, "grad_norm": 2.335078716278076, "learning_rate": 8.377083439522912e-05, "loss": 1.1034, "step": 3585 }, { "epoch": 0.17932067932067933, "grad_norm": 1.7579257488250732, "learning_rate": 8.37453488964779e-05, "loss": 1.054, "step": 3590 }, { "epoch": 0.17957042957042957, "grad_norm": 1.7958778142929077, "learning_rate": 8.371986339772669e-05, "loss": 1.1293, "step": 3595 }, { "epoch": 0.1798201798201798, "grad_norm": 1.8957895040512085, "learning_rate": 8.369437789897549e-05, "loss": 0.995, "step": 3600 }, { "epoch": 0.18006993006993008, "grad_norm": 1.40501868724823, "learning_rate": 8.366889240022427e-05, "loss": 0.9889, "step": 3605 }, { "epoch": 0.18031968031968032, "grad_norm": 2.025172472000122, "learning_rate": 8.364340690147306e-05, "loss": 1.0143, "step": 3610 }, { "epoch": 0.18056943056943056, "grad_norm": 1.4600145816802979, "learning_rate": 8.361792140272186e-05, "loss": 1.078, "step": 3615 }, { "epoch": 0.18081918081918083, "grad_norm": 1.9351260662078857, "learning_rate": 8.359243590397064e-05, "loss": 0.9516, "step": 3620 }, { "epoch": 0.18106893106893107, "grad_norm": 1.9423712491989136, "learning_rate": 8.356695040521944e-05, "loss": 1.0184, "step": 3625 }, { "epoch": 0.1813186813186813, "grad_norm": 1.8487287759780884, "learning_rate": 8.354146490646823e-05, "loss": 0.8985, "step": 3630 }, { "epoch": 0.18156843156843158, "grad_norm": 1.5882914066314697, "learning_rate": 8.351597940771701e-05, "loss": 1.0144, "step": 3635 }, { "epoch": 0.18181818181818182, "grad_norm": 1.70701265335083, "learning_rate": 8.349049390896581e-05, "loss": 0.9994, "step": 3640 }, { "epoch": 0.18206793206793206, "grad_norm": 1.7186760902404785, "learning_rate": 8.34650084102146e-05, "loss": 1.058, "step": 3645 }, { "epoch": 0.1823176823176823, "grad_norm": 1.384310245513916, "learning_rate": 8.343952291146338e-05, "loss": 0.9957, "step": 3650 }, { "epoch": 0.18256743256743257, "grad_norm": 2.5056235790252686, "learning_rate": 8.341403741271218e-05, "loss": 1.0358, "step": 3655 }, { "epoch": 0.18281718281718282, "grad_norm": 1.7599639892578125, "learning_rate": 8.338855191396095e-05, "loss": 1.1073, "step": 3660 }, { "epoch": 0.18306693306693306, "grad_norm": 1.7509809732437134, "learning_rate": 8.336306641520974e-05, "loss": 1.0117, "step": 3665 }, { "epoch": 0.18331668331668333, "grad_norm": 2.0392119884490967, "learning_rate": 8.333758091645854e-05, "loss": 1.077, "step": 3670 }, { "epoch": 0.18356643356643357, "grad_norm": 1.690687656402588, "learning_rate": 8.331209541770732e-05, "loss": 1.073, "step": 3675 }, { "epoch": 0.1838161838161838, "grad_norm": 1.593540906906128, "learning_rate": 8.328660991895612e-05, "loss": 1.1467, "step": 3680 }, { "epoch": 0.18406593406593408, "grad_norm": 1.8768028020858765, "learning_rate": 8.326112442020491e-05, "loss": 1.0151, "step": 3685 }, { "epoch": 0.18431568431568432, "grad_norm": 1.4606578350067139, "learning_rate": 8.323563892145369e-05, "loss": 1.0621, "step": 3690 }, { "epoch": 0.18456543456543456, "grad_norm": 1.7124439477920532, "learning_rate": 8.321015342270249e-05, "loss": 1.1264, "step": 3695 }, { "epoch": 0.1848151848151848, "grad_norm": 1.4532513618469238, "learning_rate": 8.318466792395128e-05, "loss": 1.0053, "step": 3700 }, { "epoch": 0.18506493506493507, "grad_norm": 2.188819646835327, "learning_rate": 8.315918242520006e-05, "loss": 1.0759, "step": 3705 }, { "epoch": 0.1853146853146853, "grad_norm": 2.3184454441070557, "learning_rate": 8.313369692644886e-05, "loss": 1.1019, "step": 3710 }, { "epoch": 0.18556443556443555, "grad_norm": 2.073598623275757, "learning_rate": 8.310821142769765e-05, "loss": 1.0901, "step": 3715 }, { "epoch": 0.18581418581418582, "grad_norm": 1.8204143047332764, "learning_rate": 8.308272592894643e-05, "loss": 1.083, "step": 3720 }, { "epoch": 0.18606393606393606, "grad_norm": 2.14795184135437, "learning_rate": 8.305724043019523e-05, "loss": 1.0737, "step": 3725 }, { "epoch": 0.1863136863136863, "grad_norm": 1.6617008447647095, "learning_rate": 8.303175493144402e-05, "loss": 1.0832, "step": 3730 }, { "epoch": 0.18656343656343657, "grad_norm": 1.7779147624969482, "learning_rate": 8.30062694326928e-05, "loss": 1.1132, "step": 3735 }, { "epoch": 0.18681318681318682, "grad_norm": 1.4784828424453735, "learning_rate": 8.298078393394159e-05, "loss": 1.1016, "step": 3740 }, { "epoch": 0.18706293706293706, "grad_norm": 1.686732530593872, "learning_rate": 8.295529843519037e-05, "loss": 1.0669, "step": 3745 }, { "epoch": 0.18731268731268733, "grad_norm": 1.8250025510787964, "learning_rate": 8.292981293643917e-05, "loss": 1.0322, "step": 3750 }, { "epoch": 0.18756243756243757, "grad_norm": 1.7952136993408203, "learning_rate": 8.290432743768796e-05, "loss": 1.0512, "step": 3755 }, { "epoch": 0.1878121878121878, "grad_norm": 1.7367311716079712, "learning_rate": 8.287884193893674e-05, "loss": 1.0351, "step": 3760 }, { "epoch": 0.18806193806193805, "grad_norm": 1.9317975044250488, "learning_rate": 8.285335644018554e-05, "loss": 1.1429, "step": 3765 }, { "epoch": 0.18831168831168832, "grad_norm": 1.9384158849716187, "learning_rate": 8.282787094143433e-05, "loss": 1.0144, "step": 3770 }, { "epoch": 0.18856143856143856, "grad_norm": 2.4914493560791016, "learning_rate": 8.280238544268311e-05, "loss": 1.039, "step": 3775 }, { "epoch": 0.1888111888111888, "grad_norm": 1.7811760902404785, "learning_rate": 8.277689994393191e-05, "loss": 1.0184, "step": 3780 }, { "epoch": 0.18906093906093907, "grad_norm": 2.1658356189727783, "learning_rate": 8.27514144451807e-05, "loss": 1.0237, "step": 3785 }, { "epoch": 0.1893106893106893, "grad_norm": 1.7575112581253052, "learning_rate": 8.272592894642948e-05, "loss": 1.099, "step": 3790 }, { "epoch": 0.18956043956043955, "grad_norm": 1.7569879293441772, "learning_rate": 8.270044344767828e-05, "loss": 1.1219, "step": 3795 }, { "epoch": 0.18981018981018982, "grad_norm": 1.9931334257125854, "learning_rate": 8.267495794892707e-05, "loss": 1.1326, "step": 3800 }, { "epoch": 0.19005994005994006, "grad_norm": 1.5611274242401123, "learning_rate": 8.264947245017585e-05, "loss": 1.0368, "step": 3805 }, { "epoch": 0.1903096903096903, "grad_norm": 1.7792850732803345, "learning_rate": 8.262398695142465e-05, "loss": 1.0783, "step": 3810 }, { "epoch": 0.19055944055944055, "grad_norm": 1.8555467128753662, "learning_rate": 8.259850145267344e-05, "loss": 0.972, "step": 3815 }, { "epoch": 0.19080919080919082, "grad_norm": 1.8904021978378296, "learning_rate": 8.257301595392222e-05, "loss": 1.0896, "step": 3820 }, { "epoch": 0.19105894105894106, "grad_norm": 2.2031562328338623, "learning_rate": 8.254753045517101e-05, "loss": 0.9914, "step": 3825 }, { "epoch": 0.1913086913086913, "grad_norm": 1.638953447341919, "learning_rate": 8.252204495641979e-05, "loss": 1.0053, "step": 3830 }, { "epoch": 0.19155844155844157, "grad_norm": 2.01694655418396, "learning_rate": 8.249655945766859e-05, "loss": 0.9854, "step": 3835 }, { "epoch": 0.1918081918081918, "grad_norm": 1.9305132627487183, "learning_rate": 8.247107395891738e-05, "loss": 1.0151, "step": 3840 }, { "epoch": 0.19205794205794205, "grad_norm": 2.931283473968506, "learning_rate": 8.244558846016616e-05, "loss": 1.0587, "step": 3845 }, { "epoch": 0.19230769230769232, "grad_norm": 2.157857656478882, "learning_rate": 8.242010296141496e-05, "loss": 0.9506, "step": 3850 }, { "epoch": 0.19255744255744256, "grad_norm": 1.901118278503418, "learning_rate": 8.239461746266375e-05, "loss": 1.1202, "step": 3855 }, { "epoch": 0.1928071928071928, "grad_norm": 1.406407356262207, "learning_rate": 8.236913196391253e-05, "loss": 1.0736, "step": 3860 }, { "epoch": 0.19305694305694307, "grad_norm": 2.073181629180908, "learning_rate": 8.234364646516133e-05, "loss": 1.0894, "step": 3865 }, { "epoch": 0.1933066933066933, "grad_norm": 1.5957709550857544, "learning_rate": 8.231816096641012e-05, "loss": 1.0845, "step": 3870 }, { "epoch": 0.19355644355644355, "grad_norm": 1.4995508193969727, "learning_rate": 8.22926754676589e-05, "loss": 1.087, "step": 3875 }, { "epoch": 0.1938061938061938, "grad_norm": 1.6431218385696411, "learning_rate": 8.22671899689077e-05, "loss": 1.0297, "step": 3880 }, { "epoch": 0.19405594405594406, "grad_norm": 2.4867990016937256, "learning_rate": 8.224170447015649e-05, "loss": 1.1753, "step": 3885 }, { "epoch": 0.1943056943056943, "grad_norm": 2.0314483642578125, "learning_rate": 8.221621897140529e-05, "loss": 1.0121, "step": 3890 }, { "epoch": 0.19455544455544455, "grad_norm": 2.5000531673431396, "learning_rate": 8.219073347265407e-05, "loss": 1.0051, "step": 3895 }, { "epoch": 0.19480519480519481, "grad_norm": 2.0018601417541504, "learning_rate": 8.216524797390284e-05, "loss": 0.9822, "step": 3900 }, { "epoch": 0.19505494505494506, "grad_norm": 2.2967371940612793, "learning_rate": 8.213976247515164e-05, "loss": 1.0578, "step": 3905 }, { "epoch": 0.1953046953046953, "grad_norm": 1.883610725402832, "learning_rate": 8.211427697640043e-05, "loss": 1.0213, "step": 3910 }, { "epoch": 0.19555444555444557, "grad_norm": 2.2703564167022705, "learning_rate": 8.208879147764921e-05, "loss": 1.0477, "step": 3915 }, { "epoch": 0.1958041958041958, "grad_norm": 1.6243152618408203, "learning_rate": 8.206330597889801e-05, "loss": 1.038, "step": 3920 }, { "epoch": 0.19605394605394605, "grad_norm": 1.652611494064331, "learning_rate": 8.20378204801468e-05, "loss": 0.987, "step": 3925 }, { "epoch": 0.1963036963036963, "grad_norm": 2.1214497089385986, "learning_rate": 8.20123349813956e-05, "loss": 1.0955, "step": 3930 }, { "epoch": 0.19655344655344656, "grad_norm": 1.705886721611023, "learning_rate": 8.198684948264438e-05, "loss": 1.106, "step": 3935 }, { "epoch": 0.1968031968031968, "grad_norm": 1.9270597696304321, "learning_rate": 8.196136398389317e-05, "loss": 1.0663, "step": 3940 }, { "epoch": 0.19705294705294704, "grad_norm": 1.868320107460022, "learning_rate": 8.193587848514197e-05, "loss": 1.0666, "step": 3945 }, { "epoch": 0.1973026973026973, "grad_norm": 1.4886274337768555, "learning_rate": 8.191039298639075e-05, "loss": 1.0718, "step": 3950 }, { "epoch": 0.19755244755244755, "grad_norm": 2.211674451828003, "learning_rate": 8.188490748763954e-05, "loss": 1.0443, "step": 3955 }, { "epoch": 0.1978021978021978, "grad_norm": 1.9535284042358398, "learning_rate": 8.185942198888833e-05, "loss": 1.0, "step": 3960 }, { "epoch": 0.19805194805194806, "grad_norm": 2.050697088241577, "learning_rate": 8.183393649013712e-05, "loss": 1.0802, "step": 3965 }, { "epoch": 0.1983016983016983, "grad_norm": 1.5371524095535278, "learning_rate": 8.18084509913859e-05, "loss": 1.0709, "step": 3970 }, { "epoch": 0.19855144855144854, "grad_norm": 1.8358978033065796, "learning_rate": 8.17829654926347e-05, "loss": 1.0659, "step": 3975 }, { "epoch": 0.19880119880119881, "grad_norm": 1.7615954875946045, "learning_rate": 8.175747999388348e-05, "loss": 1.0852, "step": 3980 }, { "epoch": 0.19905094905094906, "grad_norm": 1.837526559829712, "learning_rate": 8.173199449513228e-05, "loss": 1.1036, "step": 3985 }, { "epoch": 0.1993006993006993, "grad_norm": 1.9767099618911743, "learning_rate": 8.170650899638106e-05, "loss": 1.0658, "step": 3990 }, { "epoch": 0.19955044955044954, "grad_norm": 1.9533984661102295, "learning_rate": 8.168102349762985e-05, "loss": 1.1631, "step": 3995 }, { "epoch": 0.1998001998001998, "grad_norm": 1.8266781568527222, "learning_rate": 8.165553799887865e-05, "loss": 1.0926, "step": 4000 }, { "epoch": 0.20004995004995005, "grad_norm": 1.9316380023956299, "learning_rate": 8.163005250012743e-05, "loss": 1.0376, "step": 4005 }, { "epoch": 0.2002997002997003, "grad_norm": 2.016551971435547, "learning_rate": 8.160456700137622e-05, "loss": 1.0511, "step": 4010 }, { "epoch": 0.20054945054945056, "grad_norm": 1.8629343509674072, "learning_rate": 8.157908150262502e-05, "loss": 1.0397, "step": 4015 }, { "epoch": 0.2007992007992008, "grad_norm": 1.562517523765564, "learning_rate": 8.15535960038738e-05, "loss": 1.0193, "step": 4020 }, { "epoch": 0.20104895104895104, "grad_norm": 1.7502076625823975, "learning_rate": 8.152811050512259e-05, "loss": 1.1229, "step": 4025 }, { "epoch": 0.2012987012987013, "grad_norm": 1.7862657308578491, "learning_rate": 8.150262500637138e-05, "loss": 0.8968, "step": 4030 }, { "epoch": 0.20154845154845155, "grad_norm": 2.055274248123169, "learning_rate": 8.147713950762017e-05, "loss": 1.1997, "step": 4035 }, { "epoch": 0.2017982017982018, "grad_norm": 1.6465505361557007, "learning_rate": 8.145165400886896e-05, "loss": 0.9391, "step": 4040 }, { "epoch": 0.20204795204795203, "grad_norm": 2.0115177631378174, "learning_rate": 8.142616851011775e-05, "loss": 1.0077, "step": 4045 }, { "epoch": 0.2022977022977023, "grad_norm": 1.5239920616149902, "learning_rate": 8.140068301136654e-05, "loss": 0.8789, "step": 4050 }, { "epoch": 0.20254745254745254, "grad_norm": 2.1739954948425293, "learning_rate": 8.137519751261533e-05, "loss": 1.0539, "step": 4055 }, { "epoch": 0.20279720279720279, "grad_norm": 1.699087381362915, "learning_rate": 8.134971201386411e-05, "loss": 1.065, "step": 4060 }, { "epoch": 0.20304695304695305, "grad_norm": 1.8359402418136597, "learning_rate": 8.13242265151129e-05, "loss": 1.0583, "step": 4065 }, { "epoch": 0.2032967032967033, "grad_norm": 2.196523427963257, "learning_rate": 8.12987410163617e-05, "loss": 1.1577, "step": 4070 }, { "epoch": 0.20354645354645354, "grad_norm": 2.322420120239258, "learning_rate": 8.127325551761048e-05, "loss": 1.0687, "step": 4075 }, { "epoch": 0.2037962037962038, "grad_norm": 1.9669508934020996, "learning_rate": 8.124777001885927e-05, "loss": 0.9381, "step": 4080 }, { "epoch": 0.20404595404595405, "grad_norm": 1.6816198825836182, "learning_rate": 8.122228452010806e-05, "loss": 1.1214, "step": 4085 }, { "epoch": 0.2042957042957043, "grad_norm": 1.6130675077438354, "learning_rate": 8.119679902135685e-05, "loss": 0.9886, "step": 4090 }, { "epoch": 0.20454545454545456, "grad_norm": 1.5973551273345947, "learning_rate": 8.117131352260564e-05, "loss": 1.1397, "step": 4095 }, { "epoch": 0.2047952047952048, "grad_norm": 1.8123167753219604, "learning_rate": 8.114582802385443e-05, "loss": 1.07, "step": 4100 }, { "epoch": 0.20504495504495504, "grad_norm": 1.7340843677520752, "learning_rate": 8.112034252510322e-05, "loss": 1.1195, "step": 4105 }, { "epoch": 0.20529470529470528, "grad_norm": 1.7501031160354614, "learning_rate": 8.1094857026352e-05, "loss": 1.0362, "step": 4110 }, { "epoch": 0.20554445554445555, "grad_norm": 1.6506218910217285, "learning_rate": 8.10693715276008e-05, "loss": 1.0742, "step": 4115 }, { "epoch": 0.2057942057942058, "grad_norm": 1.8626444339752197, "learning_rate": 8.104388602884959e-05, "loss": 1.0385, "step": 4120 }, { "epoch": 0.20604395604395603, "grad_norm": 2.0279808044433594, "learning_rate": 8.101840053009838e-05, "loss": 1.0284, "step": 4125 }, { "epoch": 0.2062937062937063, "grad_norm": 1.750496506690979, "learning_rate": 8.099291503134717e-05, "loss": 0.9849, "step": 4130 }, { "epoch": 0.20654345654345654, "grad_norm": 1.843871831893921, "learning_rate": 8.096742953259596e-05, "loss": 1.0543, "step": 4135 }, { "epoch": 0.20679320679320679, "grad_norm": 1.8493355512619019, "learning_rate": 8.094194403384475e-05, "loss": 1.0511, "step": 4140 }, { "epoch": 0.20704295704295705, "grad_norm": 2.4423482418060303, "learning_rate": 8.091645853509353e-05, "loss": 1.1227, "step": 4145 }, { "epoch": 0.2072927072927073, "grad_norm": 2.29107403755188, "learning_rate": 8.089097303634232e-05, "loss": 1.0464, "step": 4150 }, { "epoch": 0.20754245754245754, "grad_norm": 2.4609811305999756, "learning_rate": 8.086548753759111e-05, "loss": 1.0666, "step": 4155 }, { "epoch": 0.2077922077922078, "grad_norm": 2.0379605293273926, "learning_rate": 8.08400020388399e-05, "loss": 0.9798, "step": 4160 }, { "epoch": 0.20804195804195805, "grad_norm": 1.377636432647705, "learning_rate": 8.081451654008869e-05, "loss": 0.9994, "step": 4165 }, { "epoch": 0.2082917082917083, "grad_norm": 1.9405335187911987, "learning_rate": 8.078903104133748e-05, "loss": 1.0745, "step": 4170 }, { "epoch": 0.20854145854145853, "grad_norm": 2.1919448375701904, "learning_rate": 8.076354554258627e-05, "loss": 0.9904, "step": 4175 }, { "epoch": 0.2087912087912088, "grad_norm": 1.9352673292160034, "learning_rate": 8.073806004383506e-05, "loss": 1.0876, "step": 4180 }, { "epoch": 0.20904095904095904, "grad_norm": 2.3606743812561035, "learning_rate": 8.071257454508385e-05, "loss": 1.075, "step": 4185 }, { "epoch": 0.20929070929070928, "grad_norm": 1.544116497039795, "learning_rate": 8.068708904633264e-05, "loss": 1.1324, "step": 4190 }, { "epoch": 0.20954045954045955, "grad_norm": 1.5757083892822266, "learning_rate": 8.066160354758144e-05, "loss": 1.0104, "step": 4195 }, { "epoch": 0.2097902097902098, "grad_norm": 1.5504379272460938, "learning_rate": 8.063611804883022e-05, "loss": 0.9976, "step": 4200 }, { "epoch": 0.21003996003996003, "grad_norm": 1.950480580329895, "learning_rate": 8.061063255007901e-05, "loss": 0.9701, "step": 4205 }, { "epoch": 0.2102897102897103, "grad_norm": 2.0043821334838867, "learning_rate": 8.058514705132781e-05, "loss": 1.017, "step": 4210 }, { "epoch": 0.21053946053946054, "grad_norm": 1.8639042377471924, "learning_rate": 8.05596615525766e-05, "loss": 1.0543, "step": 4215 }, { "epoch": 0.21078921078921078, "grad_norm": 1.813562273979187, "learning_rate": 8.053417605382537e-05, "loss": 1.0568, "step": 4220 }, { "epoch": 0.21103896103896103, "grad_norm": 2.5554585456848145, "learning_rate": 8.050869055507416e-05, "loss": 1.0325, "step": 4225 }, { "epoch": 0.2112887112887113, "grad_norm": 1.7976969480514526, "learning_rate": 8.048320505632295e-05, "loss": 1.0858, "step": 4230 }, { "epoch": 0.21153846153846154, "grad_norm": 1.8200702667236328, "learning_rate": 8.045771955757175e-05, "loss": 1.1015, "step": 4235 }, { "epoch": 0.21178821178821178, "grad_norm": 2.6109964847564697, "learning_rate": 8.043223405882053e-05, "loss": 1.036, "step": 4240 }, { "epoch": 0.21203796203796205, "grad_norm": 1.7562363147735596, "learning_rate": 8.040674856006932e-05, "loss": 1.0699, "step": 4245 }, { "epoch": 0.2122877122877123, "grad_norm": 2.31793212890625, "learning_rate": 8.038126306131812e-05, "loss": 1.0447, "step": 4250 }, { "epoch": 0.21253746253746253, "grad_norm": 1.8158233165740967, "learning_rate": 8.03557775625669e-05, "loss": 1.0646, "step": 4255 }, { "epoch": 0.2127872127872128, "grad_norm": 4.276870250701904, "learning_rate": 8.033029206381569e-05, "loss": 1.0511, "step": 4260 }, { "epoch": 0.21303696303696304, "grad_norm": 1.4953745603561401, "learning_rate": 8.030480656506449e-05, "loss": 1.1419, "step": 4265 }, { "epoch": 0.21328671328671328, "grad_norm": 1.7403877973556519, "learning_rate": 8.027932106631327e-05, "loss": 1.0674, "step": 4270 }, { "epoch": 0.21353646353646355, "grad_norm": 1.7788101434707642, "learning_rate": 8.025383556756206e-05, "loss": 1.056, "step": 4275 }, { "epoch": 0.2137862137862138, "grad_norm": 2.376905679702759, "learning_rate": 8.022835006881086e-05, "loss": 1.044, "step": 4280 }, { "epoch": 0.21403596403596403, "grad_norm": 2.0632026195526123, "learning_rate": 8.020286457005964e-05, "loss": 1.0626, "step": 4285 }, { "epoch": 0.21428571428571427, "grad_norm": 1.5301294326782227, "learning_rate": 8.017737907130843e-05, "loss": 1.0004, "step": 4290 }, { "epoch": 0.21453546453546454, "grad_norm": 1.616371989250183, "learning_rate": 8.015189357255723e-05, "loss": 0.9899, "step": 4295 }, { "epoch": 0.21478521478521478, "grad_norm": 1.684002161026001, "learning_rate": 8.0126408073806e-05, "loss": 1.1461, "step": 4300 }, { "epoch": 0.21503496503496503, "grad_norm": 3.2268154621124268, "learning_rate": 8.01009225750548e-05, "loss": 1.0305, "step": 4305 }, { "epoch": 0.2152847152847153, "grad_norm": 1.5656287670135498, "learning_rate": 8.007543707630358e-05, "loss": 1.043, "step": 4310 }, { "epoch": 0.21553446553446554, "grad_norm": 2.3049917221069336, "learning_rate": 8.004995157755237e-05, "loss": 0.9987, "step": 4315 }, { "epoch": 0.21578421578421578, "grad_norm": 2.0060033798217773, "learning_rate": 8.002446607880117e-05, "loss": 1.0977, "step": 4320 }, { "epoch": 0.21603396603396605, "grad_norm": 3.4175117015838623, "learning_rate": 7.999898058004995e-05, "loss": 1.0653, "step": 4325 }, { "epoch": 0.2162837162837163, "grad_norm": 1.8823119401931763, "learning_rate": 7.997349508129874e-05, "loss": 0.9673, "step": 4330 }, { "epoch": 0.21653346653346653, "grad_norm": 1.668336033821106, "learning_rate": 7.994800958254754e-05, "loss": 0.9621, "step": 4335 }, { "epoch": 0.21678321678321677, "grad_norm": 1.8225713968276978, "learning_rate": 7.992252408379632e-05, "loss": 1.0324, "step": 4340 }, { "epoch": 0.21703296703296704, "grad_norm": 1.8834762573242188, "learning_rate": 7.989703858504511e-05, "loss": 1.0697, "step": 4345 }, { "epoch": 0.21728271728271728, "grad_norm": 1.6210649013519287, "learning_rate": 7.987155308629391e-05, "loss": 1.1199, "step": 4350 }, { "epoch": 0.21753246753246752, "grad_norm": 2.5299785137176514, "learning_rate": 7.98460675875427e-05, "loss": 1.0196, "step": 4355 }, { "epoch": 0.2177822177822178, "grad_norm": 1.6540402173995972, "learning_rate": 7.982058208879148e-05, "loss": 1.0907, "step": 4360 }, { "epoch": 0.21803196803196803, "grad_norm": 2.0360190868377686, "learning_rate": 7.979509659004028e-05, "loss": 0.9976, "step": 4365 }, { "epoch": 0.21828171828171827, "grad_norm": 3.381786584854126, "learning_rate": 7.976961109128906e-05, "loss": 1.0484, "step": 4370 }, { "epoch": 0.21853146853146854, "grad_norm": 1.6287943124771118, "learning_rate": 7.974412559253785e-05, "loss": 0.9781, "step": 4375 }, { "epoch": 0.21878121878121878, "grad_norm": 1.766602873802185, "learning_rate": 7.971864009378663e-05, "loss": 1.0043, "step": 4380 }, { "epoch": 0.21903096903096902, "grad_norm": 1.895731806755066, "learning_rate": 7.969315459503542e-05, "loss": 0.9641, "step": 4385 }, { "epoch": 0.2192807192807193, "grad_norm": 2.3229100704193115, "learning_rate": 7.966766909628422e-05, "loss": 0.9985, "step": 4390 }, { "epoch": 0.21953046953046954, "grad_norm": 2.228792190551758, "learning_rate": 7.9642183597533e-05, "loss": 1.0701, "step": 4395 }, { "epoch": 0.21978021978021978, "grad_norm": 2.216813325881958, "learning_rate": 7.961669809878179e-05, "loss": 1.0631, "step": 4400 }, { "epoch": 0.22002997002997002, "grad_norm": 2.3935205936431885, "learning_rate": 7.959121260003059e-05, "loss": 1.1115, "step": 4405 }, { "epoch": 0.2202797202797203, "grad_norm": 2.630929946899414, "learning_rate": 7.956572710127937e-05, "loss": 1.0321, "step": 4410 }, { "epoch": 0.22052947052947053, "grad_norm": 1.8631958961486816, "learning_rate": 7.954024160252816e-05, "loss": 1.082, "step": 4415 }, { "epoch": 0.22077922077922077, "grad_norm": 2.335817813873291, "learning_rate": 7.951475610377696e-05, "loss": 1.1252, "step": 4420 }, { "epoch": 0.22102897102897104, "grad_norm": 2.0801730155944824, "learning_rate": 7.948927060502574e-05, "loss": 0.9814, "step": 4425 }, { "epoch": 0.22127872127872128, "grad_norm": 1.6793395280838013, "learning_rate": 7.946378510627453e-05, "loss": 1.0482, "step": 4430 }, { "epoch": 0.22152847152847152, "grad_norm": 1.923393726348877, "learning_rate": 7.943829960752333e-05, "loss": 1.0391, "step": 4435 }, { "epoch": 0.2217782217782218, "grad_norm": 1.673514723777771, "learning_rate": 7.941281410877211e-05, "loss": 0.9687, "step": 4440 }, { "epoch": 0.22202797202797203, "grad_norm": 1.7573161125183105, "learning_rate": 7.938732861002091e-05, "loss": 0.9793, "step": 4445 }, { "epoch": 0.22227772227772227, "grad_norm": 1.8152977228164673, "learning_rate": 7.93618431112697e-05, "loss": 1.0443, "step": 4450 }, { "epoch": 0.22252747252747251, "grad_norm": 1.9609544277191162, "learning_rate": 7.933635761251848e-05, "loss": 0.9712, "step": 4455 }, { "epoch": 0.22277722277722278, "grad_norm": 2.098844051361084, "learning_rate": 7.931087211376727e-05, "loss": 1.0192, "step": 4460 }, { "epoch": 0.22302697302697302, "grad_norm": 1.97663414478302, "learning_rate": 7.928538661501605e-05, "loss": 1.0328, "step": 4465 }, { "epoch": 0.22327672327672327, "grad_norm": 2.6888411045074463, "learning_rate": 7.925990111626484e-05, "loss": 1.143, "step": 4470 }, { "epoch": 0.22352647352647353, "grad_norm": 2.2202484607696533, "learning_rate": 7.923441561751364e-05, "loss": 0.9739, "step": 4475 }, { "epoch": 0.22377622377622378, "grad_norm": 2.1188807487487793, "learning_rate": 7.920893011876242e-05, "loss": 0.9793, "step": 4480 }, { "epoch": 0.22402597402597402, "grad_norm": 1.6401931047439575, "learning_rate": 7.918344462001121e-05, "loss": 1.1832, "step": 4485 }, { "epoch": 0.2242757242757243, "grad_norm": 1.8669472932815552, "learning_rate": 7.915795912126001e-05, "loss": 1.0676, "step": 4490 }, { "epoch": 0.22452547452547453, "grad_norm": 1.8524556159973145, "learning_rate": 7.91324736225088e-05, "loss": 1.0484, "step": 4495 }, { "epoch": 0.22477522477522477, "grad_norm": 1.7987474203109741, "learning_rate": 7.910698812375759e-05, "loss": 1.02, "step": 4500 }, { "epoch": 0.22502497502497504, "grad_norm": 1.4022389650344849, "learning_rate": 7.908150262500638e-05, "loss": 1.0758, "step": 4505 }, { "epoch": 0.22527472527472528, "grad_norm": 1.4945101737976074, "learning_rate": 7.905601712625516e-05, "loss": 1.0385, "step": 4510 }, { "epoch": 0.22552447552447552, "grad_norm": 1.682323694229126, "learning_rate": 7.903053162750396e-05, "loss": 1.0589, "step": 4515 }, { "epoch": 0.22577422577422576, "grad_norm": 2.282167434692383, "learning_rate": 7.900504612875275e-05, "loss": 0.9866, "step": 4520 }, { "epoch": 0.22602397602397603, "grad_norm": 1.5892890691757202, "learning_rate": 7.897956063000153e-05, "loss": 1.0501, "step": 4525 }, { "epoch": 0.22627372627372627, "grad_norm": 2.016310453414917, "learning_rate": 7.895407513125033e-05, "loss": 1.0975, "step": 4530 }, { "epoch": 0.2265234765234765, "grad_norm": 1.6020146608352661, "learning_rate": 7.892858963249912e-05, "loss": 1.0391, "step": 4535 }, { "epoch": 0.22677322677322678, "grad_norm": 2.4940216541290283, "learning_rate": 7.890310413374789e-05, "loss": 0.9954, "step": 4540 }, { "epoch": 0.22702297702297702, "grad_norm": 2.1699228286743164, "learning_rate": 7.887761863499669e-05, "loss": 1.1688, "step": 4545 }, { "epoch": 0.22727272727272727, "grad_norm": 2.629871129989624, "learning_rate": 7.885213313624547e-05, "loss": 1.1588, "step": 4550 }, { "epoch": 0.22752247752247753, "grad_norm": 1.8032234907150269, "learning_rate": 7.882664763749427e-05, "loss": 1.2314, "step": 4555 }, { "epoch": 0.22777222777222778, "grad_norm": 1.9101934432983398, "learning_rate": 7.880116213874306e-05, "loss": 1.1114, "step": 4560 }, { "epoch": 0.22802197802197802, "grad_norm": 1.7841362953186035, "learning_rate": 7.877567663999184e-05, "loss": 0.9949, "step": 4565 }, { "epoch": 0.22827172827172826, "grad_norm": 1.5989936590194702, "learning_rate": 7.875019114124064e-05, "loss": 0.9949, "step": 4570 }, { "epoch": 0.22852147852147853, "grad_norm": 1.6785179376602173, "learning_rate": 7.872470564248943e-05, "loss": 1.0158, "step": 4575 }, { "epoch": 0.22877122877122877, "grad_norm": 1.7461285591125488, "learning_rate": 7.869922014373821e-05, "loss": 1.011, "step": 4580 }, { "epoch": 0.229020979020979, "grad_norm": 1.5746034383773804, "learning_rate": 7.867373464498701e-05, "loss": 1.0706, "step": 4585 }, { "epoch": 0.22927072927072928, "grad_norm": 3.787637233734131, "learning_rate": 7.86482491462358e-05, "loss": 1.0569, "step": 4590 }, { "epoch": 0.22952047952047952, "grad_norm": 1.5961445569992065, "learning_rate": 7.862276364748458e-05, "loss": 1.041, "step": 4595 }, { "epoch": 0.22977022977022976, "grad_norm": 1.8055496215820312, "learning_rate": 7.859727814873338e-05, "loss": 0.9403, "step": 4600 }, { "epoch": 0.23001998001998003, "grad_norm": 1.585745930671692, "learning_rate": 7.857179264998217e-05, "loss": 1.1289, "step": 4605 }, { "epoch": 0.23026973026973027, "grad_norm": 1.64175283908844, "learning_rate": 7.854630715123095e-05, "loss": 1.0588, "step": 4610 }, { "epoch": 0.2305194805194805, "grad_norm": 1.5917845964431763, "learning_rate": 7.852082165247975e-05, "loss": 1.0004, "step": 4615 }, { "epoch": 0.23076923076923078, "grad_norm": 1.6781035661697388, "learning_rate": 7.849533615372852e-05, "loss": 1.0661, "step": 4620 }, { "epoch": 0.23101898101898102, "grad_norm": 2.06455659866333, "learning_rate": 7.846985065497732e-05, "loss": 1.0031, "step": 4625 }, { "epoch": 0.23126873126873126, "grad_norm": 1.8574366569519043, "learning_rate": 7.844436515622611e-05, "loss": 1.1202, "step": 4630 }, { "epoch": 0.2315184815184815, "grad_norm": 2.1858973503112793, "learning_rate": 7.84188796574749e-05, "loss": 0.9922, "step": 4635 }, { "epoch": 0.23176823176823177, "grad_norm": 1.9599342346191406, "learning_rate": 7.839339415872369e-05, "loss": 0.9786, "step": 4640 }, { "epoch": 0.23201798201798202, "grad_norm": 2.0038461685180664, "learning_rate": 7.836790865997248e-05, "loss": 0.9464, "step": 4645 }, { "epoch": 0.23226773226773226, "grad_norm": 2.0661306381225586, "learning_rate": 7.834242316122126e-05, "loss": 0.9855, "step": 4650 }, { "epoch": 0.23251748251748253, "grad_norm": 1.865768551826477, "learning_rate": 7.831693766247006e-05, "loss": 0.9497, "step": 4655 }, { "epoch": 0.23276723276723277, "grad_norm": 1.8975129127502441, "learning_rate": 7.829145216371885e-05, "loss": 0.9388, "step": 4660 }, { "epoch": 0.233016983016983, "grad_norm": 2.1126043796539307, "learning_rate": 7.826596666496763e-05, "loss": 1.0837, "step": 4665 }, { "epoch": 0.23326673326673328, "grad_norm": 2.2528364658355713, "learning_rate": 7.824048116621643e-05, "loss": 1.0028, "step": 4670 }, { "epoch": 0.23351648351648352, "grad_norm": 2.0935659408569336, "learning_rate": 7.821499566746522e-05, "loss": 0.9653, "step": 4675 }, { "epoch": 0.23376623376623376, "grad_norm": 2.1829593181610107, "learning_rate": 7.8189510168714e-05, "loss": 1.0408, "step": 4680 }, { "epoch": 0.234015984015984, "grad_norm": 3.08906626701355, "learning_rate": 7.81640246699628e-05, "loss": 0.9565, "step": 4685 }, { "epoch": 0.23426573426573427, "grad_norm": 2.123894453048706, "learning_rate": 7.813853917121159e-05, "loss": 1.0668, "step": 4690 }, { "epoch": 0.2345154845154845, "grad_norm": 1.8171356916427612, "learning_rate": 7.811305367246037e-05, "loss": 1.127, "step": 4695 }, { "epoch": 0.23476523476523475, "grad_norm": 4.807708263397217, "learning_rate": 7.808756817370916e-05, "loss": 0.9487, "step": 4700 }, { "epoch": 0.23501498501498502, "grad_norm": 3.0979394912719727, "learning_rate": 7.806208267495794e-05, "loss": 1.1091, "step": 4705 }, { "epoch": 0.23526473526473526, "grad_norm": 2.7720797061920166, "learning_rate": 7.803659717620674e-05, "loss": 1.114, "step": 4710 }, { "epoch": 0.2355144855144855, "grad_norm": 2.019779682159424, "learning_rate": 7.801111167745553e-05, "loss": 0.9419, "step": 4715 }, { "epoch": 0.23576423576423577, "grad_norm": 2.7700259685516357, "learning_rate": 7.798562617870431e-05, "loss": 1.1492, "step": 4720 }, { "epoch": 0.23601398601398602, "grad_norm": 1.5744861364364624, "learning_rate": 7.796014067995311e-05, "loss": 1.0617, "step": 4725 }, { "epoch": 0.23626373626373626, "grad_norm": 1.6773059368133545, "learning_rate": 7.79346551812019e-05, "loss": 1.0907, "step": 4730 }, { "epoch": 0.23651348651348653, "grad_norm": 2.045384168624878, "learning_rate": 7.790916968245068e-05, "loss": 0.8725, "step": 4735 }, { "epoch": 0.23676323676323677, "grad_norm": 2.083090305328369, "learning_rate": 7.788368418369948e-05, "loss": 1.0596, "step": 4740 }, { "epoch": 0.237012987012987, "grad_norm": 1.7435014247894287, "learning_rate": 7.785819868494827e-05, "loss": 1.052, "step": 4745 }, { "epoch": 0.23726273726273725, "grad_norm": 1.9300342798233032, "learning_rate": 7.783271318619707e-05, "loss": 1.1876, "step": 4750 }, { "epoch": 0.23751248751248752, "grad_norm": 2.0810794830322266, "learning_rate": 7.780722768744585e-05, "loss": 1.0234, "step": 4755 }, { "epoch": 0.23776223776223776, "grad_norm": 2.0376906394958496, "learning_rate": 7.778174218869464e-05, "loss": 1.0499, "step": 4760 }, { "epoch": 0.238011988011988, "grad_norm": 4.161076068878174, "learning_rate": 7.775625668994344e-05, "loss": 1.0155, "step": 4765 }, { "epoch": 0.23826173826173827, "grad_norm": 1.7556109428405762, "learning_rate": 7.773077119119222e-05, "loss": 1.0959, "step": 4770 }, { "epoch": 0.2385114885114885, "grad_norm": 1.739386796951294, "learning_rate": 7.7705285692441e-05, "loss": 1.0784, "step": 4775 }, { "epoch": 0.23876123876123875, "grad_norm": 1.78800368309021, "learning_rate": 7.767980019368979e-05, "loss": 1.075, "step": 4780 }, { "epoch": 0.23901098901098902, "grad_norm": 1.8468023538589478, "learning_rate": 7.765431469493858e-05, "loss": 0.9857, "step": 4785 }, { "epoch": 0.23926073926073926, "grad_norm": 2.289717197418213, "learning_rate": 7.762882919618736e-05, "loss": 1.0321, "step": 4790 }, { "epoch": 0.2395104895104895, "grad_norm": 1.5225098133087158, "learning_rate": 7.760334369743616e-05, "loss": 1.0647, "step": 4795 }, { "epoch": 0.23976023976023977, "grad_norm": 2.049543619155884, "learning_rate": 7.757785819868495e-05, "loss": 0.9598, "step": 4800 }, { "epoch": 0.24000999000999002, "grad_norm": 2.5328476428985596, "learning_rate": 7.755237269993375e-05, "loss": 1.0665, "step": 4805 }, { "epoch": 0.24025974025974026, "grad_norm": 2.4472174644470215, "learning_rate": 7.752688720118253e-05, "loss": 1.1015, "step": 4810 }, { "epoch": 0.2405094905094905, "grad_norm": 1.5198802947998047, "learning_rate": 7.750140170243132e-05, "loss": 1.0643, "step": 4815 }, { "epoch": 0.24075924075924077, "grad_norm": 2.4154412746429443, "learning_rate": 7.747591620368012e-05, "loss": 1.0457, "step": 4820 }, { "epoch": 0.241008991008991, "grad_norm": 1.5116082429885864, "learning_rate": 7.74504307049289e-05, "loss": 0.9867, "step": 4825 }, { "epoch": 0.24125874125874125, "grad_norm": 2.0457067489624023, "learning_rate": 7.742494520617769e-05, "loss": 0.9908, "step": 4830 }, { "epoch": 0.24150849150849152, "grad_norm": 2.508091688156128, "learning_rate": 7.739945970742649e-05, "loss": 1.0048, "step": 4835 }, { "epoch": 0.24175824175824176, "grad_norm": 1.9757366180419922, "learning_rate": 7.737397420867527e-05, "loss": 1.06, "step": 4840 }, { "epoch": 0.242007992007992, "grad_norm": 1.6665515899658203, "learning_rate": 7.734848870992406e-05, "loss": 1.0564, "step": 4845 }, { "epoch": 0.24225774225774227, "grad_norm": 1.6706490516662598, "learning_rate": 7.732300321117286e-05, "loss": 1.0741, "step": 4850 }, { "epoch": 0.2425074925074925, "grad_norm": 2.276467800140381, "learning_rate": 7.729751771242164e-05, "loss": 1.0205, "step": 4855 }, { "epoch": 0.24275724275724275, "grad_norm": 2.863227367401123, "learning_rate": 7.727203221367043e-05, "loss": 0.9817, "step": 4860 }, { "epoch": 0.243006993006993, "grad_norm": 1.9697482585906982, "learning_rate": 7.724654671491921e-05, "loss": 1.0609, "step": 4865 }, { "epoch": 0.24325674325674326, "grad_norm": 2.432643175125122, "learning_rate": 7.7221061216168e-05, "loss": 1.0315, "step": 4870 }, { "epoch": 0.2435064935064935, "grad_norm": 2.3304364681243896, "learning_rate": 7.71955757174168e-05, "loss": 1.0683, "step": 4875 }, { "epoch": 0.24375624375624375, "grad_norm": 1.7669438123703003, "learning_rate": 7.717009021866558e-05, "loss": 1.0352, "step": 4880 }, { "epoch": 0.24400599400599401, "grad_norm": 2.390686273574829, "learning_rate": 7.714460471991437e-05, "loss": 1.0687, "step": 4885 }, { "epoch": 0.24425574425574426, "grad_norm": 2.1844141483306885, "learning_rate": 7.711911922116317e-05, "loss": 0.9465, "step": 4890 }, { "epoch": 0.2445054945054945, "grad_norm": 2.180518865585327, "learning_rate": 7.709363372241195e-05, "loss": 1.0369, "step": 4895 }, { "epoch": 0.24475524475524477, "grad_norm": 1.9334185123443604, "learning_rate": 7.706814822366074e-05, "loss": 1.0276, "step": 4900 }, { "epoch": 0.245004995004995, "grad_norm": 2.200727701187134, "learning_rate": 7.704266272490954e-05, "loss": 1.0477, "step": 4905 }, { "epoch": 0.24525474525474525, "grad_norm": 1.675630807876587, "learning_rate": 7.701717722615832e-05, "loss": 1.0504, "step": 4910 }, { "epoch": 0.24550449550449552, "grad_norm": 2.139488458633423, "learning_rate": 7.69916917274071e-05, "loss": 1.107, "step": 4915 }, { "epoch": 0.24575424575424576, "grad_norm": 1.6376665830612183, "learning_rate": 7.69662062286559e-05, "loss": 1.032, "step": 4920 }, { "epoch": 0.246003996003996, "grad_norm": 2.001588821411133, "learning_rate": 7.694072072990469e-05, "loss": 1.0137, "step": 4925 }, { "epoch": 0.24625374625374624, "grad_norm": 1.5129578113555908, "learning_rate": 7.691523523115348e-05, "loss": 0.9912, "step": 4930 }, { "epoch": 0.2465034965034965, "grad_norm": 2.4645586013793945, "learning_rate": 7.688974973240228e-05, "loss": 0.9493, "step": 4935 }, { "epoch": 0.24675324675324675, "grad_norm": 1.5370393991470337, "learning_rate": 7.686426423365105e-05, "loss": 1.0871, "step": 4940 }, { "epoch": 0.247002997002997, "grad_norm": 2.0858418941497803, "learning_rate": 7.683877873489985e-05, "loss": 1.0421, "step": 4945 }, { "epoch": 0.24725274725274726, "grad_norm": 2.1412692070007324, "learning_rate": 7.681329323614863e-05, "loss": 1.0956, "step": 4950 }, { "epoch": 0.2475024975024975, "grad_norm": 2.117612600326538, "learning_rate": 7.678780773739742e-05, "loss": 1.029, "step": 4955 }, { "epoch": 0.24775224775224775, "grad_norm": 1.7101547718048096, "learning_rate": 7.676232223864622e-05, "loss": 1.0589, "step": 4960 }, { "epoch": 0.24800199800199801, "grad_norm": 2.7010884284973145, "learning_rate": 7.6736836739895e-05, "loss": 0.9574, "step": 4965 }, { "epoch": 0.24825174825174826, "grad_norm": 1.995436668395996, "learning_rate": 7.671135124114379e-05, "loss": 1.0411, "step": 4970 }, { "epoch": 0.2485014985014985, "grad_norm": 1.8976463079452515, "learning_rate": 7.668586574239259e-05, "loss": 1.0917, "step": 4975 }, { "epoch": 0.24875124875124874, "grad_norm": 1.4475399255752563, "learning_rate": 7.666038024364137e-05, "loss": 1.0927, "step": 4980 }, { "epoch": 0.249000999000999, "grad_norm": 1.6889349222183228, "learning_rate": 7.663489474489016e-05, "loss": 1.0831, "step": 4985 }, { "epoch": 0.24925074925074925, "grad_norm": 1.4610087871551514, "learning_rate": 7.660940924613896e-05, "loss": 1.034, "step": 4990 }, { "epoch": 0.2495004995004995, "grad_norm": 2.5302772521972656, "learning_rate": 7.658392374738774e-05, "loss": 0.9827, "step": 4995 }, { "epoch": 0.24975024975024976, "grad_norm": 2.066997766494751, "learning_rate": 7.655843824863653e-05, "loss": 1.0647, "step": 5000 }, { "epoch": 0.25, "grad_norm": 1.8916367292404175, "learning_rate": 7.653295274988532e-05, "loss": 0.9901, "step": 5005 }, { "epoch": 0.25024975024975027, "grad_norm": 2.3649702072143555, "learning_rate": 7.650746725113411e-05, "loss": 0.8058, "step": 5010 }, { "epoch": 0.2504995004995005, "grad_norm": 2.155064344406128, "learning_rate": 7.648198175238291e-05, "loss": 0.979, "step": 5015 }, { "epoch": 0.25074925074925075, "grad_norm": 1.806206226348877, "learning_rate": 7.645649625363168e-05, "loss": 0.9957, "step": 5020 }, { "epoch": 0.250999000999001, "grad_norm": 1.709810733795166, "learning_rate": 7.643101075488047e-05, "loss": 1.001, "step": 5025 }, { "epoch": 0.25124875124875123, "grad_norm": 2.850813627243042, "learning_rate": 7.640552525612927e-05, "loss": 1.0545, "step": 5030 }, { "epoch": 0.2514985014985015, "grad_norm": 2.100074291229248, "learning_rate": 7.638003975737805e-05, "loss": 0.9833, "step": 5035 }, { "epoch": 0.2517482517482518, "grad_norm": 1.909712553024292, "learning_rate": 7.635455425862684e-05, "loss": 1.0739, "step": 5040 }, { "epoch": 0.251998001998002, "grad_norm": 1.9120919704437256, "learning_rate": 7.632906875987564e-05, "loss": 1.0924, "step": 5045 }, { "epoch": 0.25224775224775225, "grad_norm": 1.9310723543167114, "learning_rate": 7.630358326112442e-05, "loss": 0.9736, "step": 5050 }, { "epoch": 0.2524975024975025, "grad_norm": 2.1766226291656494, "learning_rate": 7.627809776237322e-05, "loss": 1.035, "step": 5055 }, { "epoch": 0.25274725274725274, "grad_norm": 2.1145689487457275, "learning_rate": 7.6252612263622e-05, "loss": 0.9864, "step": 5060 }, { "epoch": 0.252997002997003, "grad_norm": 2.0535898208618164, "learning_rate": 7.622712676487079e-05, "loss": 0.9398, "step": 5065 }, { "epoch": 0.2532467532467532, "grad_norm": 1.8851364850997925, "learning_rate": 7.620164126611959e-05, "loss": 1.0723, "step": 5070 }, { "epoch": 0.2534965034965035, "grad_norm": 1.51603364944458, "learning_rate": 7.617615576736837e-05, "loss": 0.9821, "step": 5075 }, { "epoch": 0.25374625374625376, "grad_norm": 1.7879979610443115, "learning_rate": 7.615067026861716e-05, "loss": 1.0686, "step": 5080 }, { "epoch": 0.25399600399600397, "grad_norm": 1.78134024143219, "learning_rate": 7.612518476986596e-05, "loss": 1.1269, "step": 5085 }, { "epoch": 0.25424575424575424, "grad_norm": 2.2491114139556885, "learning_rate": 7.609969927111474e-05, "loss": 0.9465, "step": 5090 }, { "epoch": 0.2544955044955045, "grad_norm": 3.1511194705963135, "learning_rate": 7.607421377236353e-05, "loss": 1.1614, "step": 5095 }, { "epoch": 0.2547452547452547, "grad_norm": 2.0767757892608643, "learning_rate": 7.604872827361232e-05, "loss": 1.0735, "step": 5100 }, { "epoch": 0.254995004995005, "grad_norm": 1.636931300163269, "learning_rate": 7.60232427748611e-05, "loss": 1.1191, "step": 5105 }, { "epoch": 0.25524475524475526, "grad_norm": 2.3691956996917725, "learning_rate": 7.59977572761099e-05, "loss": 1.1014, "step": 5110 }, { "epoch": 0.2554945054945055, "grad_norm": 2.1244609355926514, "learning_rate": 7.597227177735869e-05, "loss": 1.0655, "step": 5115 }, { "epoch": 0.25574425574425574, "grad_norm": 1.8507598638534546, "learning_rate": 7.594678627860747e-05, "loss": 0.9298, "step": 5120 }, { "epoch": 0.255994005994006, "grad_norm": 1.9682705402374268, "learning_rate": 7.592130077985627e-05, "loss": 0.9467, "step": 5125 }, { "epoch": 0.2562437562437562, "grad_norm": 2.3301291465759277, "learning_rate": 7.589581528110505e-05, "loss": 1.0195, "step": 5130 }, { "epoch": 0.2564935064935065, "grad_norm": 2.639894962310791, "learning_rate": 7.587032978235384e-05, "loss": 1.031, "step": 5135 }, { "epoch": 0.25674325674325676, "grad_norm": 1.8825178146362305, "learning_rate": 7.584484428360264e-05, "loss": 1.1273, "step": 5140 }, { "epoch": 0.256993006993007, "grad_norm": 2.4721477031707764, "learning_rate": 7.581935878485142e-05, "loss": 0.9829, "step": 5145 }, { "epoch": 0.25724275724275725, "grad_norm": 2.0210976600646973, "learning_rate": 7.579387328610021e-05, "loss": 1.1514, "step": 5150 }, { "epoch": 0.2574925074925075, "grad_norm": 2.043745279312134, "learning_rate": 7.576838778734901e-05, "loss": 1.0593, "step": 5155 }, { "epoch": 0.25774225774225773, "grad_norm": 1.7326202392578125, "learning_rate": 7.57429022885978e-05, "loss": 0.9861, "step": 5160 }, { "epoch": 0.257992007992008, "grad_norm": 1.6216927766799927, "learning_rate": 7.571741678984658e-05, "loss": 1.093, "step": 5165 }, { "epoch": 0.25824175824175827, "grad_norm": 1.9901914596557617, "learning_rate": 7.569193129109538e-05, "loss": 0.9432, "step": 5170 }, { "epoch": 0.2584915084915085, "grad_norm": 1.7844946384429932, "learning_rate": 7.566644579234416e-05, "loss": 0.9183, "step": 5175 }, { "epoch": 0.25874125874125875, "grad_norm": 1.9742131233215332, "learning_rate": 7.564096029359295e-05, "loss": 1.0654, "step": 5180 }, { "epoch": 0.25899100899100896, "grad_norm": 2.4195590019226074, "learning_rate": 7.561547479484174e-05, "loss": 1.0047, "step": 5185 }, { "epoch": 0.25924075924075923, "grad_norm": 1.6503692865371704, "learning_rate": 7.558998929609052e-05, "loss": 0.9936, "step": 5190 }, { "epoch": 0.2594905094905095, "grad_norm": 2.2571916580200195, "learning_rate": 7.556450379733932e-05, "loss": 1.0331, "step": 5195 }, { "epoch": 0.2597402597402597, "grad_norm": 3.0033440589904785, "learning_rate": 7.55390182985881e-05, "loss": 1.0749, "step": 5200 }, { "epoch": 0.25999000999001, "grad_norm": 1.8866443634033203, "learning_rate": 7.551353279983689e-05, "loss": 1.0868, "step": 5205 }, { "epoch": 0.26023976023976025, "grad_norm": 2.1566002368927, "learning_rate": 7.548804730108569e-05, "loss": 0.9878, "step": 5210 }, { "epoch": 0.26048951048951047, "grad_norm": 1.9262076616287231, "learning_rate": 7.546256180233447e-05, "loss": 1.0331, "step": 5215 }, { "epoch": 0.26073926073926074, "grad_norm": 2.203005313873291, "learning_rate": 7.543707630358326e-05, "loss": 1.0374, "step": 5220 }, { "epoch": 0.260989010989011, "grad_norm": 1.7180277109146118, "learning_rate": 7.541159080483206e-05, "loss": 0.9755, "step": 5225 }, { "epoch": 0.2612387612387612, "grad_norm": 1.7348897457122803, "learning_rate": 7.538610530608084e-05, "loss": 1.0327, "step": 5230 }, { "epoch": 0.2614885114885115, "grad_norm": 1.949957251548767, "learning_rate": 7.536061980732963e-05, "loss": 0.9472, "step": 5235 }, { "epoch": 0.26173826173826176, "grad_norm": 2.03574800491333, "learning_rate": 7.533513430857843e-05, "loss": 1.0126, "step": 5240 }, { "epoch": 0.26198801198801197, "grad_norm": 2.1787068843841553, "learning_rate": 7.530964880982721e-05, "loss": 1.0223, "step": 5245 }, { "epoch": 0.26223776223776224, "grad_norm": 1.9440627098083496, "learning_rate": 7.5284163311076e-05, "loss": 0.9588, "step": 5250 }, { "epoch": 0.2624875124875125, "grad_norm": 1.6939008235931396, "learning_rate": 7.52586778123248e-05, "loss": 0.935, "step": 5255 }, { "epoch": 0.2627372627372627, "grad_norm": 1.8697701692581177, "learning_rate": 7.523319231357357e-05, "loss": 0.9654, "step": 5260 }, { "epoch": 0.262987012987013, "grad_norm": 1.8214843273162842, "learning_rate": 7.520770681482237e-05, "loss": 1.1219, "step": 5265 }, { "epoch": 0.26323676323676326, "grad_norm": 1.711678385734558, "learning_rate": 7.518222131607115e-05, "loss": 1.0181, "step": 5270 }, { "epoch": 0.2634865134865135, "grad_norm": 1.6236392259597778, "learning_rate": 7.515673581731994e-05, "loss": 1.0338, "step": 5275 }, { "epoch": 0.26373626373626374, "grad_norm": 1.522229790687561, "learning_rate": 7.513125031856874e-05, "loss": 1.0727, "step": 5280 }, { "epoch": 0.263986013986014, "grad_norm": 2.18021297454834, "learning_rate": 7.510576481981752e-05, "loss": 1.0697, "step": 5285 }, { "epoch": 0.2642357642357642, "grad_norm": 1.9739171266555786, "learning_rate": 7.508027932106631e-05, "loss": 1.0067, "step": 5290 }, { "epoch": 0.2644855144855145, "grad_norm": 1.7759761810302734, "learning_rate": 7.505479382231511e-05, "loss": 1.0291, "step": 5295 }, { "epoch": 0.2647352647352647, "grad_norm": 1.9466276168823242, "learning_rate": 7.50293083235639e-05, "loss": 1.1048, "step": 5300 }, { "epoch": 0.264985014985015, "grad_norm": 2.7473809719085693, "learning_rate": 7.500382282481268e-05, "loss": 1.0468, "step": 5305 }, { "epoch": 0.26523476523476525, "grad_norm": 1.8070800304412842, "learning_rate": 7.497833732606148e-05, "loss": 1.0808, "step": 5310 }, { "epoch": 0.26548451548451546, "grad_norm": 1.9270546436309814, "learning_rate": 7.495285182731026e-05, "loss": 1.0423, "step": 5315 }, { "epoch": 0.26573426573426573, "grad_norm": 1.5912681818008423, "learning_rate": 7.492736632855906e-05, "loss": 1.0321, "step": 5320 }, { "epoch": 0.265984015984016, "grad_norm": 1.8720779418945312, "learning_rate": 7.490188082980785e-05, "loss": 1.0024, "step": 5325 }, { "epoch": 0.2662337662337662, "grad_norm": 2.2314369678497314, "learning_rate": 7.487639533105663e-05, "loss": 0.988, "step": 5330 }, { "epoch": 0.2664835164835165, "grad_norm": 1.9692296981811523, "learning_rate": 7.485090983230543e-05, "loss": 1.1147, "step": 5335 }, { "epoch": 0.26673326673326675, "grad_norm": 1.6623531579971313, "learning_rate": 7.48254243335542e-05, "loss": 1.0815, "step": 5340 }, { "epoch": 0.26698301698301696, "grad_norm": 1.698994517326355, "learning_rate": 7.479993883480299e-05, "loss": 1.0425, "step": 5345 }, { "epoch": 0.26723276723276723, "grad_norm": 2.0368902683258057, "learning_rate": 7.477445333605179e-05, "loss": 1.0934, "step": 5350 }, { "epoch": 0.2674825174825175, "grad_norm": 3.4019775390625, "learning_rate": 7.474896783730057e-05, "loss": 1.1363, "step": 5355 }, { "epoch": 0.2677322677322677, "grad_norm": 1.6450436115264893, "learning_rate": 7.472348233854936e-05, "loss": 1.0003, "step": 5360 }, { "epoch": 0.267982017982018, "grad_norm": 1.9382550716400146, "learning_rate": 7.469799683979816e-05, "loss": 1.0381, "step": 5365 }, { "epoch": 0.26823176823176825, "grad_norm": 1.9327597618103027, "learning_rate": 7.467251134104694e-05, "loss": 1.0519, "step": 5370 }, { "epoch": 0.26848151848151847, "grad_norm": 1.8713724613189697, "learning_rate": 7.464702584229574e-05, "loss": 1.0288, "step": 5375 }, { "epoch": 0.26873126873126874, "grad_norm": 1.7281807661056519, "learning_rate": 7.462154034354453e-05, "loss": 0.9987, "step": 5380 }, { "epoch": 0.268981018981019, "grad_norm": 1.8159165382385254, "learning_rate": 7.459605484479331e-05, "loss": 0.9291, "step": 5385 }, { "epoch": 0.2692307692307692, "grad_norm": 2.5080490112304688, "learning_rate": 7.457056934604211e-05, "loss": 1.0121, "step": 5390 }, { "epoch": 0.2694805194805195, "grad_norm": 2.0519590377807617, "learning_rate": 7.45450838472909e-05, "loss": 1.0456, "step": 5395 }, { "epoch": 0.26973026973026976, "grad_norm": 2.2975635528564453, "learning_rate": 7.451959834853968e-05, "loss": 1.1752, "step": 5400 }, { "epoch": 0.26998001998001997, "grad_norm": 1.7583258152008057, "learning_rate": 7.449411284978848e-05, "loss": 1.0775, "step": 5405 }, { "epoch": 0.27022977022977024, "grad_norm": 2.157249927520752, "learning_rate": 7.446862735103727e-05, "loss": 1.0716, "step": 5410 }, { "epoch": 0.27047952047952045, "grad_norm": 1.6647003889083862, "learning_rate": 7.444314185228605e-05, "loss": 0.9735, "step": 5415 }, { "epoch": 0.2707292707292707, "grad_norm": 3.4810123443603516, "learning_rate": 7.441765635353484e-05, "loss": 1.1215, "step": 5420 }, { "epoch": 0.270979020979021, "grad_norm": 1.576505184173584, "learning_rate": 7.439217085478362e-05, "loss": 1.0822, "step": 5425 }, { "epoch": 0.2712287712287712, "grad_norm": 1.8655037879943848, "learning_rate": 7.436668535603242e-05, "loss": 1.0422, "step": 5430 }, { "epoch": 0.2714785214785215, "grad_norm": 1.8829668760299683, "learning_rate": 7.434119985728121e-05, "loss": 1.0946, "step": 5435 }, { "epoch": 0.27172827172827174, "grad_norm": 1.8848007917404175, "learning_rate": 7.431571435853e-05, "loss": 1.0673, "step": 5440 }, { "epoch": 0.27197802197802196, "grad_norm": 1.6314436197280884, "learning_rate": 7.429022885977879e-05, "loss": 1.069, "step": 5445 }, { "epoch": 0.2722277722277722, "grad_norm": 2.000218152999878, "learning_rate": 7.426474336102758e-05, "loss": 1.0356, "step": 5450 }, { "epoch": 0.2724775224775225, "grad_norm": 1.764844298362732, "learning_rate": 7.423925786227636e-05, "loss": 1.0769, "step": 5455 }, { "epoch": 0.2727272727272727, "grad_norm": 1.6560906171798706, "learning_rate": 7.421377236352516e-05, "loss": 1.008, "step": 5460 }, { "epoch": 0.272977022977023, "grad_norm": 1.607142686843872, "learning_rate": 7.418828686477395e-05, "loss": 0.9952, "step": 5465 }, { "epoch": 0.27322677322677325, "grad_norm": 2.1233675479888916, "learning_rate": 7.416280136602273e-05, "loss": 1.0279, "step": 5470 }, { "epoch": 0.27347652347652346, "grad_norm": 2.160606861114502, "learning_rate": 7.413731586727153e-05, "loss": 0.961, "step": 5475 }, { "epoch": 0.27372627372627373, "grad_norm": 1.9707294702529907, "learning_rate": 7.411183036852032e-05, "loss": 1.0477, "step": 5480 }, { "epoch": 0.273976023976024, "grad_norm": 2.795624256134033, "learning_rate": 7.40863448697691e-05, "loss": 1.0124, "step": 5485 }, { "epoch": 0.2742257742257742, "grad_norm": 2.7478513717651367, "learning_rate": 7.40608593710179e-05, "loss": 1.0229, "step": 5490 }, { "epoch": 0.2744755244755245, "grad_norm": 1.9767814874649048, "learning_rate": 7.403537387226669e-05, "loss": 0.9877, "step": 5495 }, { "epoch": 0.27472527472527475, "grad_norm": 1.8293638229370117, "learning_rate": 7.400988837351547e-05, "loss": 1.0486, "step": 5500 }, { "epoch": 0.27497502497502496, "grad_norm": 1.9815032482147217, "learning_rate": 7.398440287476426e-05, "loss": 0.9634, "step": 5505 }, { "epoch": 0.27522477522477523, "grad_norm": 1.5540897846221924, "learning_rate": 7.395891737601304e-05, "loss": 1.1281, "step": 5510 }, { "epoch": 0.2754745254745255, "grad_norm": 2.109125852584839, "learning_rate": 7.393343187726184e-05, "loss": 0.9516, "step": 5515 }, { "epoch": 0.2757242757242757, "grad_norm": 1.8000074625015259, "learning_rate": 7.390794637851063e-05, "loss": 0.9981, "step": 5520 }, { "epoch": 0.275974025974026, "grad_norm": 2.064631938934326, "learning_rate": 7.388246087975941e-05, "loss": 1.0303, "step": 5525 }, { "epoch": 0.2762237762237762, "grad_norm": 1.8636445999145508, "learning_rate": 7.385697538100821e-05, "loss": 1.1369, "step": 5530 }, { "epoch": 0.27647352647352647, "grad_norm": 1.488811731338501, "learning_rate": 7.3831489882257e-05, "loss": 1.1089, "step": 5535 }, { "epoch": 0.27672327672327673, "grad_norm": 2.3022987842559814, "learning_rate": 7.380600438350578e-05, "loss": 1.1134, "step": 5540 }, { "epoch": 0.27697302697302695, "grad_norm": 2.1971514225006104, "learning_rate": 7.378051888475458e-05, "loss": 1.0762, "step": 5545 }, { "epoch": 0.2772227772227772, "grad_norm": 1.551557183265686, "learning_rate": 7.375503338600337e-05, "loss": 0.9449, "step": 5550 }, { "epoch": 0.2774725274725275, "grad_norm": 2.0641748905181885, "learning_rate": 7.372954788725215e-05, "loss": 0.9562, "step": 5555 }, { "epoch": 0.2777222777222777, "grad_norm": 2.6412525177001953, "learning_rate": 7.370406238850095e-05, "loss": 0.9262, "step": 5560 }, { "epoch": 0.27797202797202797, "grad_norm": 2.7439937591552734, "learning_rate": 7.367857688974974e-05, "loss": 1.0081, "step": 5565 }, { "epoch": 0.27822177822177824, "grad_norm": 3.238938808441162, "learning_rate": 7.365309139099854e-05, "loss": 1.0355, "step": 5570 }, { "epoch": 0.27847152847152845, "grad_norm": 2.005143165588379, "learning_rate": 7.362760589224732e-05, "loss": 1.0254, "step": 5575 }, { "epoch": 0.2787212787212787, "grad_norm": 1.959616780281067, "learning_rate": 7.36021203934961e-05, "loss": 0.9845, "step": 5580 }, { "epoch": 0.278971028971029, "grad_norm": 1.6460177898406982, "learning_rate": 7.357663489474489e-05, "loss": 0.9485, "step": 5585 }, { "epoch": 0.2792207792207792, "grad_norm": 1.9786043167114258, "learning_rate": 7.355114939599368e-05, "loss": 1.0086, "step": 5590 }, { "epoch": 0.27947052947052947, "grad_norm": 1.6268415451049805, "learning_rate": 7.352566389724246e-05, "loss": 0.9772, "step": 5595 }, { "epoch": 0.27972027972027974, "grad_norm": 2.050100088119507, "learning_rate": 7.350017839849126e-05, "loss": 1.064, "step": 5600 }, { "epoch": 0.27997002997002995, "grad_norm": 1.5007853507995605, "learning_rate": 7.347469289974005e-05, "loss": 1.0911, "step": 5605 }, { "epoch": 0.2802197802197802, "grad_norm": 2.8260931968688965, "learning_rate": 7.344920740098883e-05, "loss": 1.0634, "step": 5610 }, { "epoch": 0.2804695304695305, "grad_norm": 2.015601634979248, "learning_rate": 7.342372190223763e-05, "loss": 1.0771, "step": 5615 }, { "epoch": 0.2807192807192807, "grad_norm": 3.665992021560669, "learning_rate": 7.339823640348642e-05, "loss": 0.9208, "step": 5620 }, { "epoch": 0.280969030969031, "grad_norm": 2.166318655014038, "learning_rate": 7.337275090473522e-05, "loss": 1.1727, "step": 5625 }, { "epoch": 0.28121878121878124, "grad_norm": 2.0064218044281006, "learning_rate": 7.3347265405984e-05, "loss": 1.0143, "step": 5630 }, { "epoch": 0.28146853146853146, "grad_norm": 1.905432105064392, "learning_rate": 7.332177990723279e-05, "loss": 1.0042, "step": 5635 }, { "epoch": 0.2817182817182817, "grad_norm": 1.7741644382476807, "learning_rate": 7.329629440848159e-05, "loss": 1.0313, "step": 5640 }, { "epoch": 0.281968031968032, "grad_norm": 2.085387945175171, "learning_rate": 7.327080890973037e-05, "loss": 1.0904, "step": 5645 }, { "epoch": 0.2822177822177822, "grad_norm": 3.115861654281616, "learning_rate": 7.324532341097916e-05, "loss": 1.0479, "step": 5650 }, { "epoch": 0.2824675324675325, "grad_norm": 3.044856548309326, "learning_rate": 7.321983791222796e-05, "loss": 1.0497, "step": 5655 }, { "epoch": 0.2827172827172827, "grad_norm": 1.6150795221328735, "learning_rate": 7.319435241347673e-05, "loss": 1.0153, "step": 5660 }, { "epoch": 0.28296703296703296, "grad_norm": 1.9379756450653076, "learning_rate": 7.316886691472551e-05, "loss": 1.0951, "step": 5665 }, { "epoch": 0.28321678321678323, "grad_norm": 1.7692570686340332, "learning_rate": 7.314338141597431e-05, "loss": 1.0737, "step": 5670 }, { "epoch": 0.28346653346653344, "grad_norm": 1.6927388906478882, "learning_rate": 7.31178959172231e-05, "loss": 0.9852, "step": 5675 }, { "epoch": 0.2837162837162837, "grad_norm": 1.9092258214950562, "learning_rate": 7.30924104184719e-05, "loss": 1.1251, "step": 5680 }, { "epoch": 0.283966033966034, "grad_norm": 1.8926985263824463, "learning_rate": 7.306692491972068e-05, "loss": 1.0817, "step": 5685 }, { "epoch": 0.2842157842157842, "grad_norm": 1.7376550436019897, "learning_rate": 7.304143942096947e-05, "loss": 1.0119, "step": 5690 }, { "epoch": 0.28446553446553446, "grad_norm": 1.6109119653701782, "learning_rate": 7.301595392221827e-05, "loss": 0.905, "step": 5695 }, { "epoch": 0.28471528471528473, "grad_norm": 1.5279611349105835, "learning_rate": 7.299046842346705e-05, "loss": 1.0019, "step": 5700 }, { "epoch": 0.28496503496503495, "grad_norm": 1.546050786972046, "learning_rate": 7.296498292471584e-05, "loss": 1.0421, "step": 5705 }, { "epoch": 0.2852147852147852, "grad_norm": 2.1550872325897217, "learning_rate": 7.293949742596464e-05, "loss": 1.0892, "step": 5710 }, { "epoch": 0.2854645354645355, "grad_norm": 2.823500394821167, "learning_rate": 7.291401192721342e-05, "loss": 1.0521, "step": 5715 }, { "epoch": 0.2857142857142857, "grad_norm": 2.96592116355896, "learning_rate": 7.288852642846221e-05, "loss": 0.9684, "step": 5720 }, { "epoch": 0.28596403596403597, "grad_norm": 1.734725832939148, "learning_rate": 7.2863040929711e-05, "loss": 0.9796, "step": 5725 }, { "epoch": 0.28621378621378624, "grad_norm": 1.9991103410720825, "learning_rate": 7.283755543095979e-05, "loss": 0.9711, "step": 5730 }, { "epoch": 0.28646353646353645, "grad_norm": 1.937422275543213, "learning_rate": 7.281206993220858e-05, "loss": 1.0239, "step": 5735 }, { "epoch": 0.2867132867132867, "grad_norm": 1.6919355392456055, "learning_rate": 7.278658443345736e-05, "loss": 1.0645, "step": 5740 }, { "epoch": 0.286963036963037, "grad_norm": 1.6486693620681763, "learning_rate": 7.276109893470615e-05, "loss": 1.0451, "step": 5745 }, { "epoch": 0.2872127872127872, "grad_norm": 1.7411932945251465, "learning_rate": 7.273561343595495e-05, "loss": 1.0562, "step": 5750 }, { "epoch": 0.28746253746253747, "grad_norm": 1.4536148309707642, "learning_rate": 7.271012793720373e-05, "loss": 0.9321, "step": 5755 }, { "epoch": 0.28771228771228774, "grad_norm": 1.8457754850387573, "learning_rate": 7.268464243845252e-05, "loss": 1.0379, "step": 5760 }, { "epoch": 0.28796203796203795, "grad_norm": 1.7738784551620483, "learning_rate": 7.265915693970132e-05, "loss": 1.0142, "step": 5765 }, { "epoch": 0.2882117882117882, "grad_norm": 1.7353675365447998, "learning_rate": 7.26336714409501e-05, "loss": 1.0573, "step": 5770 }, { "epoch": 0.28846153846153844, "grad_norm": 1.8078935146331787, "learning_rate": 7.260818594219889e-05, "loss": 1.0508, "step": 5775 }, { "epoch": 0.2887112887112887, "grad_norm": 2.184950113296509, "learning_rate": 7.258270044344769e-05, "loss": 1.0062, "step": 5780 }, { "epoch": 0.288961038961039, "grad_norm": 2.1623973846435547, "learning_rate": 7.255721494469647e-05, "loss": 0.9655, "step": 5785 }, { "epoch": 0.2892107892107892, "grad_norm": 1.5844616889953613, "learning_rate": 7.253172944594526e-05, "loss": 0.9433, "step": 5790 }, { "epoch": 0.28946053946053946, "grad_norm": 1.913388729095459, "learning_rate": 7.250624394719406e-05, "loss": 1.0823, "step": 5795 }, { "epoch": 0.2897102897102897, "grad_norm": 2.031667470932007, "learning_rate": 7.248075844844284e-05, "loss": 1.0468, "step": 5800 }, { "epoch": 0.28996003996003994, "grad_norm": 1.8511956930160522, "learning_rate": 7.245527294969163e-05, "loss": 0.9667, "step": 5805 }, { "epoch": 0.2902097902097902, "grad_norm": 1.668877363204956, "learning_rate": 7.242978745094043e-05, "loss": 1.0686, "step": 5810 }, { "epoch": 0.2904595404595405, "grad_norm": 1.8170700073242188, "learning_rate": 7.240430195218921e-05, "loss": 1.0386, "step": 5815 }, { "epoch": 0.2907092907092907, "grad_norm": 2.108354091644287, "learning_rate": 7.2378816453438e-05, "loss": 0.9612, "step": 5820 }, { "epoch": 0.29095904095904096, "grad_norm": 1.833757758140564, "learning_rate": 7.235333095468678e-05, "loss": 1.1132, "step": 5825 }, { "epoch": 0.29120879120879123, "grad_norm": 1.9576832056045532, "learning_rate": 7.232784545593557e-05, "loss": 1.0448, "step": 5830 }, { "epoch": 0.29145854145854144, "grad_norm": 2.100254535675049, "learning_rate": 7.230235995718437e-05, "loss": 1.0776, "step": 5835 }, { "epoch": 0.2917082917082917, "grad_norm": 1.9296543598175049, "learning_rate": 7.227687445843315e-05, "loss": 1.0881, "step": 5840 }, { "epoch": 0.291958041958042, "grad_norm": 1.5316691398620605, "learning_rate": 7.225138895968194e-05, "loss": 0.987, "step": 5845 }, { "epoch": 0.2922077922077922, "grad_norm": 1.8960007429122925, "learning_rate": 7.222590346093074e-05, "loss": 0.9518, "step": 5850 }, { "epoch": 0.29245754245754246, "grad_norm": 1.7871769666671753, "learning_rate": 7.220041796217952e-05, "loss": 1.0461, "step": 5855 }, { "epoch": 0.29270729270729273, "grad_norm": 1.5789884328842163, "learning_rate": 7.217493246342831e-05, "loss": 0.9613, "step": 5860 }, { "epoch": 0.29295704295704295, "grad_norm": 1.6548566818237305, "learning_rate": 7.21494469646771e-05, "loss": 0.9499, "step": 5865 }, { "epoch": 0.2932067932067932, "grad_norm": 2.0735507011413574, "learning_rate": 7.212396146592589e-05, "loss": 1.0558, "step": 5870 }, { "epoch": 0.2934565434565435, "grad_norm": 2.2409348487854004, "learning_rate": 7.209847596717468e-05, "loss": 1.0564, "step": 5875 }, { "epoch": 0.2937062937062937, "grad_norm": 2.2286741733551025, "learning_rate": 7.207299046842348e-05, "loss": 1.0965, "step": 5880 }, { "epoch": 0.29395604395604397, "grad_norm": 2.0183796882629395, "learning_rate": 7.204750496967226e-05, "loss": 1.0991, "step": 5885 }, { "epoch": 0.2942057942057942, "grad_norm": 2.0117483139038086, "learning_rate": 7.202201947092106e-05, "loss": 0.9754, "step": 5890 }, { "epoch": 0.29445554445554445, "grad_norm": 2.07564115524292, "learning_rate": 7.199653397216985e-05, "loss": 1.1217, "step": 5895 }, { "epoch": 0.2947052947052947, "grad_norm": 2.122250556945801, "learning_rate": 7.197104847341862e-05, "loss": 1.2117, "step": 5900 }, { "epoch": 0.29495504495504493, "grad_norm": 1.5650967359542847, "learning_rate": 7.194556297466742e-05, "loss": 1.1164, "step": 5905 }, { "epoch": 0.2952047952047952, "grad_norm": 1.8270782232284546, "learning_rate": 7.19200774759162e-05, "loss": 1.0701, "step": 5910 }, { "epoch": 0.29545454545454547, "grad_norm": 1.9122847318649292, "learning_rate": 7.189459197716499e-05, "loss": 1.0467, "step": 5915 }, { "epoch": 0.2957042957042957, "grad_norm": 1.647701382637024, "learning_rate": 7.186910647841379e-05, "loss": 1.0663, "step": 5920 }, { "epoch": 0.29595404595404595, "grad_norm": 2.0857555866241455, "learning_rate": 7.184362097966257e-05, "loss": 1.0536, "step": 5925 }, { "epoch": 0.2962037962037962, "grad_norm": 1.846787452697754, "learning_rate": 7.181813548091137e-05, "loss": 0.9747, "step": 5930 }, { "epoch": 0.29645354645354643, "grad_norm": 1.6002603769302368, "learning_rate": 7.179264998216016e-05, "loss": 1.0288, "step": 5935 }, { "epoch": 0.2967032967032967, "grad_norm": 1.9411011934280396, "learning_rate": 7.176716448340894e-05, "loss": 1.0821, "step": 5940 }, { "epoch": 0.296953046953047, "grad_norm": 1.4743974208831787, "learning_rate": 7.174167898465774e-05, "loss": 1.0181, "step": 5945 }, { "epoch": 0.2972027972027972, "grad_norm": 2.9584906101226807, "learning_rate": 7.171619348590653e-05, "loss": 1.0832, "step": 5950 }, { "epoch": 0.29745254745254746, "grad_norm": 2.0453147888183594, "learning_rate": 7.169070798715531e-05, "loss": 0.9718, "step": 5955 }, { "epoch": 0.2977022977022977, "grad_norm": 2.440640449523926, "learning_rate": 7.166522248840411e-05, "loss": 1.0309, "step": 5960 }, { "epoch": 0.29795204795204794, "grad_norm": 1.728193759918213, "learning_rate": 7.16397369896529e-05, "loss": 1.0024, "step": 5965 }, { "epoch": 0.2982017982017982, "grad_norm": 1.6844053268432617, "learning_rate": 7.161425149090168e-05, "loss": 1.1094, "step": 5970 }, { "epoch": 0.2984515484515485, "grad_norm": 1.6607909202575684, "learning_rate": 7.158876599215048e-05, "loss": 1.0416, "step": 5975 }, { "epoch": 0.2987012987012987, "grad_norm": 1.6083799600601196, "learning_rate": 7.156328049339925e-05, "loss": 0.9759, "step": 5980 }, { "epoch": 0.29895104895104896, "grad_norm": 1.6488494873046875, "learning_rate": 7.153779499464805e-05, "loss": 1.009, "step": 5985 }, { "epoch": 0.29920079920079923, "grad_norm": 1.7905969619750977, "learning_rate": 7.151230949589684e-05, "loss": 0.9569, "step": 5990 }, { "epoch": 0.29945054945054944, "grad_norm": 1.398938775062561, "learning_rate": 7.148682399714562e-05, "loss": 1.027, "step": 5995 }, { "epoch": 0.2997002997002997, "grad_norm": 2.010087728500366, "learning_rate": 7.146133849839442e-05, "loss": 1.0317, "step": 6000 }, { "epoch": 0.2999500499500499, "grad_norm": 2.389011859893799, "learning_rate": 7.14358529996432e-05, "loss": 1.1811, "step": 6005 }, { "epoch": 0.3001998001998002, "grad_norm": 1.8682610988616943, "learning_rate": 7.141036750089199e-05, "loss": 1.1109, "step": 6010 }, { "epoch": 0.30044955044955046, "grad_norm": 1.5816984176635742, "learning_rate": 7.138488200214079e-05, "loss": 0.9946, "step": 6015 }, { "epoch": 0.3006993006993007, "grad_norm": 1.7521203756332397, "learning_rate": 7.135939650338958e-05, "loss": 1.0106, "step": 6020 }, { "epoch": 0.30094905094905094, "grad_norm": 2.072221517562866, "learning_rate": 7.133391100463836e-05, "loss": 1.0189, "step": 6025 }, { "epoch": 0.3011988011988012, "grad_norm": 1.6137669086456299, "learning_rate": 7.130842550588716e-05, "loss": 1.0816, "step": 6030 }, { "epoch": 0.3014485514485514, "grad_norm": 1.9235141277313232, "learning_rate": 7.128294000713595e-05, "loss": 1.0119, "step": 6035 }, { "epoch": 0.3016983016983017, "grad_norm": 2.259138345718384, "learning_rate": 7.125745450838473e-05, "loss": 1.0372, "step": 6040 }, { "epoch": 0.30194805194805197, "grad_norm": 1.8913726806640625, "learning_rate": 7.123196900963353e-05, "loss": 1.021, "step": 6045 }, { "epoch": 0.3021978021978022, "grad_norm": 2.602698564529419, "learning_rate": 7.120648351088231e-05, "loss": 1.022, "step": 6050 }, { "epoch": 0.30244755244755245, "grad_norm": 1.8708970546722412, "learning_rate": 7.11809980121311e-05, "loss": 1.0388, "step": 6055 }, { "epoch": 0.3026973026973027, "grad_norm": 1.6032131910324097, "learning_rate": 7.115551251337989e-05, "loss": 1.1297, "step": 6060 }, { "epoch": 0.30294705294705293, "grad_norm": 1.842826247215271, "learning_rate": 7.113002701462867e-05, "loss": 1.0807, "step": 6065 }, { "epoch": 0.3031968031968032, "grad_norm": 1.7373815774917603, "learning_rate": 7.110454151587747e-05, "loss": 1.0787, "step": 6070 }, { "epoch": 0.30344655344655347, "grad_norm": 2.1008505821228027, "learning_rate": 7.107905601712626e-05, "loss": 0.9743, "step": 6075 }, { "epoch": 0.3036963036963037, "grad_norm": 1.8074133396148682, "learning_rate": 7.105357051837504e-05, "loss": 1.0477, "step": 6080 }, { "epoch": 0.30394605394605395, "grad_norm": 3.1432957649230957, "learning_rate": 7.102808501962384e-05, "loss": 0.9808, "step": 6085 }, { "epoch": 0.3041958041958042, "grad_norm": 1.967804193496704, "learning_rate": 7.100259952087263e-05, "loss": 1.1995, "step": 6090 }, { "epoch": 0.30444555444555443, "grad_norm": 2.9563050270080566, "learning_rate": 7.097711402212141e-05, "loss": 1.0551, "step": 6095 }, { "epoch": 0.3046953046953047, "grad_norm": 1.7698121070861816, "learning_rate": 7.095162852337021e-05, "loss": 1.109, "step": 6100 }, { "epoch": 0.30494505494505497, "grad_norm": 1.614898920059204, "learning_rate": 7.0926143024619e-05, "loss": 1.0961, "step": 6105 }, { "epoch": 0.3051948051948052, "grad_norm": 1.8398268222808838, "learning_rate": 7.090065752586778e-05, "loss": 1.0703, "step": 6110 }, { "epoch": 0.30544455544455545, "grad_norm": 1.5147804021835327, "learning_rate": 7.087517202711658e-05, "loss": 0.9505, "step": 6115 }, { "epoch": 0.30569430569430567, "grad_norm": 2.228177785873413, "learning_rate": 7.084968652836536e-05, "loss": 1.0268, "step": 6120 }, { "epoch": 0.30594405594405594, "grad_norm": 1.8606116771697998, "learning_rate": 7.082420102961415e-05, "loss": 0.972, "step": 6125 }, { "epoch": 0.3061938061938062, "grad_norm": 1.9711462259292603, "learning_rate": 7.079871553086295e-05, "loss": 1.0171, "step": 6130 }, { "epoch": 0.3064435564435564, "grad_norm": 2.798313617706299, "learning_rate": 7.077323003211173e-05, "loss": 0.9945, "step": 6135 }, { "epoch": 0.3066933066933067, "grad_norm": 1.7483222484588623, "learning_rate": 7.074774453336052e-05, "loss": 1.0912, "step": 6140 }, { "epoch": 0.30694305694305696, "grad_norm": 1.7902244329452515, "learning_rate": 7.07222590346093e-05, "loss": 1.1045, "step": 6145 }, { "epoch": 0.30719280719280717, "grad_norm": 1.8761173486709595, "learning_rate": 7.069677353585809e-05, "loss": 1.0506, "step": 6150 }, { "epoch": 0.30744255744255744, "grad_norm": 1.8929243087768555, "learning_rate": 7.067128803710689e-05, "loss": 1.0585, "step": 6155 }, { "epoch": 0.3076923076923077, "grad_norm": 1.6574996709823608, "learning_rate": 7.064580253835568e-05, "loss": 1.0773, "step": 6160 }, { "epoch": 0.3079420579420579, "grad_norm": 2.1309285163879395, "learning_rate": 7.062031703960446e-05, "loss": 1.0458, "step": 6165 }, { "epoch": 0.3081918081918082, "grad_norm": 1.7124555110931396, "learning_rate": 7.059483154085326e-05, "loss": 1.0377, "step": 6170 }, { "epoch": 0.30844155844155846, "grad_norm": 2.0462229251861572, "learning_rate": 7.056934604210204e-05, "loss": 0.9745, "step": 6175 }, { "epoch": 0.3086913086913087, "grad_norm": 2.632260322570801, "learning_rate": 7.054386054335083e-05, "loss": 1.1181, "step": 6180 }, { "epoch": 0.30894105894105894, "grad_norm": 2.2896876335144043, "learning_rate": 7.051837504459963e-05, "loss": 0.9088, "step": 6185 }, { "epoch": 0.3091908091908092, "grad_norm": 2.415555238723755, "learning_rate": 7.049288954584841e-05, "loss": 1.076, "step": 6190 }, { "epoch": 0.3094405594405594, "grad_norm": 1.4302645921707153, "learning_rate": 7.046740404709721e-05, "loss": 1.0427, "step": 6195 }, { "epoch": 0.3096903096903097, "grad_norm": 1.7936162948608398, "learning_rate": 7.0441918548346e-05, "loss": 1.0603, "step": 6200 }, { "epoch": 0.30994005994005996, "grad_norm": 2.8622546195983887, "learning_rate": 7.041643304959478e-05, "loss": 1.0488, "step": 6205 }, { "epoch": 0.3101898101898102, "grad_norm": 2.0244789123535156, "learning_rate": 7.039094755084358e-05, "loss": 0.9636, "step": 6210 }, { "epoch": 0.31043956043956045, "grad_norm": 1.7083553075790405, "learning_rate": 7.036546205209237e-05, "loss": 1.0966, "step": 6215 }, { "epoch": 0.3106893106893107, "grad_norm": 1.7545779943466187, "learning_rate": 7.033997655334114e-05, "loss": 0.9715, "step": 6220 }, { "epoch": 0.31093906093906093, "grad_norm": 1.994056224822998, "learning_rate": 7.031449105458994e-05, "loss": 1.0615, "step": 6225 }, { "epoch": 0.3111888111888112, "grad_norm": 1.7080551385879517, "learning_rate": 7.028900555583873e-05, "loss": 0.9901, "step": 6230 }, { "epoch": 0.3114385614385614, "grad_norm": 1.3601038455963135, "learning_rate": 7.026352005708752e-05, "loss": 1.0993, "step": 6235 }, { "epoch": 0.3116883116883117, "grad_norm": 1.6991262435913086, "learning_rate": 7.023803455833631e-05, "loss": 1.043, "step": 6240 }, { "epoch": 0.31193806193806195, "grad_norm": 1.9205740690231323, "learning_rate": 7.02125490595851e-05, "loss": 1.0221, "step": 6245 }, { "epoch": 0.31218781218781216, "grad_norm": 2.0390241146087646, "learning_rate": 7.01870635608339e-05, "loss": 1.1273, "step": 6250 }, { "epoch": 0.31243756243756243, "grad_norm": 1.7392657995224, "learning_rate": 7.016157806208268e-05, "loss": 1.0204, "step": 6255 }, { "epoch": 0.3126873126873127, "grad_norm": 1.5373167991638184, "learning_rate": 7.013609256333146e-05, "loss": 1.1083, "step": 6260 }, { "epoch": 0.3129370629370629, "grad_norm": 1.71504545211792, "learning_rate": 7.011060706458026e-05, "loss": 1.0946, "step": 6265 }, { "epoch": 0.3131868131868132, "grad_norm": 1.5606290102005005, "learning_rate": 7.008512156582905e-05, "loss": 1.048, "step": 6270 }, { "epoch": 0.31343656343656345, "grad_norm": 3.052295207977295, "learning_rate": 7.005963606707783e-05, "loss": 1.0568, "step": 6275 }, { "epoch": 0.31368631368631367, "grad_norm": 1.5166233777999878, "learning_rate": 7.003415056832663e-05, "loss": 0.9882, "step": 6280 }, { "epoch": 0.31393606393606394, "grad_norm": 2.2646188735961914, "learning_rate": 7.000866506957542e-05, "loss": 1.0205, "step": 6285 }, { "epoch": 0.3141858141858142, "grad_norm": 2.3659980297088623, "learning_rate": 6.99831795708242e-05, "loss": 0.9612, "step": 6290 }, { "epoch": 0.3144355644355644, "grad_norm": 2.1659936904907227, "learning_rate": 6.9957694072073e-05, "loss": 1.1553, "step": 6295 }, { "epoch": 0.3146853146853147, "grad_norm": 2.2024145126342773, "learning_rate": 6.993220857332177e-05, "loss": 0.9555, "step": 6300 }, { "epoch": 0.31493506493506496, "grad_norm": 1.6150102615356445, "learning_rate": 6.990672307457057e-05, "loss": 0.9473, "step": 6305 }, { "epoch": 0.31518481518481517, "grad_norm": 1.9656704664230347, "learning_rate": 6.988123757581936e-05, "loss": 1.0985, "step": 6310 }, { "epoch": 0.31543456543456544, "grad_norm": 1.711432695388794, "learning_rate": 6.985575207706814e-05, "loss": 0.9879, "step": 6315 }, { "epoch": 0.3156843156843157, "grad_norm": 1.9529718160629272, "learning_rate": 6.983026657831694e-05, "loss": 1.0262, "step": 6320 }, { "epoch": 0.3159340659340659, "grad_norm": 1.9320037364959717, "learning_rate": 6.980478107956573e-05, "loss": 0.9979, "step": 6325 }, { "epoch": 0.3161838161838162, "grad_norm": 2.1962459087371826, "learning_rate": 6.977929558081451e-05, "loss": 0.9779, "step": 6330 }, { "epoch": 0.31643356643356646, "grad_norm": 1.8415571451187134, "learning_rate": 6.975381008206331e-05, "loss": 1.1324, "step": 6335 }, { "epoch": 0.3166833166833167, "grad_norm": 3.0879368782043457, "learning_rate": 6.97283245833121e-05, "loss": 1.0355, "step": 6340 }, { "epoch": 0.31693306693306694, "grad_norm": 3.443336248397827, "learning_rate": 6.970283908456088e-05, "loss": 1.1524, "step": 6345 }, { "epoch": 0.31718281718281716, "grad_norm": 1.5789637565612793, "learning_rate": 6.967735358580968e-05, "loss": 1.0304, "step": 6350 }, { "epoch": 0.3174325674325674, "grad_norm": 2.9197258949279785, "learning_rate": 6.965186808705847e-05, "loss": 1.1029, "step": 6355 }, { "epoch": 0.3176823176823177, "grad_norm": 1.7560954093933105, "learning_rate": 6.962638258830725e-05, "loss": 1.0705, "step": 6360 }, { "epoch": 0.3179320679320679, "grad_norm": 1.831823706626892, "learning_rate": 6.960089708955605e-05, "loss": 1.0465, "step": 6365 }, { "epoch": 0.3181818181818182, "grad_norm": 2.1826977729797363, "learning_rate": 6.957541159080484e-05, "loss": 1.0656, "step": 6370 }, { "epoch": 0.31843156843156845, "grad_norm": 2.012267827987671, "learning_rate": 6.954992609205362e-05, "loss": 1.1569, "step": 6375 }, { "epoch": 0.31868131868131866, "grad_norm": 2.257779359817505, "learning_rate": 6.952444059330241e-05, "loss": 1.036, "step": 6380 }, { "epoch": 0.31893106893106893, "grad_norm": 1.7664368152618408, "learning_rate": 6.94989550945512e-05, "loss": 1.0164, "step": 6385 }, { "epoch": 0.3191808191808192, "grad_norm": 2.0553927421569824, "learning_rate": 6.94734695958e-05, "loss": 1.0365, "step": 6390 }, { "epoch": 0.3194305694305694, "grad_norm": 1.6482720375061035, "learning_rate": 6.944798409704878e-05, "loss": 1.0931, "step": 6395 }, { "epoch": 0.3196803196803197, "grad_norm": 2.2288973331451416, "learning_rate": 6.942249859829756e-05, "loss": 1.112, "step": 6400 }, { "epoch": 0.31993006993006995, "grad_norm": 1.9116727113723755, "learning_rate": 6.939701309954636e-05, "loss": 1.029, "step": 6405 }, { "epoch": 0.32017982017982016, "grad_norm": 1.7353243827819824, "learning_rate": 6.937152760079515e-05, "loss": 1.0378, "step": 6410 }, { "epoch": 0.32042957042957043, "grad_norm": 1.7444422245025635, "learning_rate": 6.934604210204393e-05, "loss": 1.066, "step": 6415 }, { "epoch": 0.3206793206793207, "grad_norm": 1.8555166721343994, "learning_rate": 6.932055660329273e-05, "loss": 0.9505, "step": 6420 }, { "epoch": 0.3209290709290709, "grad_norm": 2.0973398685455322, "learning_rate": 6.929507110454152e-05, "loss": 0.9555, "step": 6425 }, { "epoch": 0.3211788211788212, "grad_norm": 1.5252944231033325, "learning_rate": 6.92695856057903e-05, "loss": 1.0069, "step": 6430 }, { "epoch": 0.32142857142857145, "grad_norm": 1.8916029930114746, "learning_rate": 6.92441001070391e-05, "loss": 1.1542, "step": 6435 }, { "epoch": 0.32167832167832167, "grad_norm": 1.817818522453308, "learning_rate": 6.921861460828789e-05, "loss": 1.096, "step": 6440 }, { "epoch": 0.32192807192807193, "grad_norm": 1.9277124404907227, "learning_rate": 6.919312910953669e-05, "loss": 1.045, "step": 6445 }, { "epoch": 0.3221778221778222, "grad_norm": 1.8853377103805542, "learning_rate": 6.916764361078547e-05, "loss": 1.0145, "step": 6450 }, { "epoch": 0.3224275724275724, "grad_norm": 2.098036050796509, "learning_rate": 6.914215811203426e-05, "loss": 1.0922, "step": 6455 }, { "epoch": 0.3226773226773227, "grad_norm": 1.918009877204895, "learning_rate": 6.911667261328304e-05, "loss": 1.0024, "step": 6460 }, { "epoch": 0.3229270729270729, "grad_norm": 1.7543106079101562, "learning_rate": 6.909118711453183e-05, "loss": 1.0132, "step": 6465 }, { "epoch": 0.32317682317682317, "grad_norm": 2.2700929641723633, "learning_rate": 6.906570161578061e-05, "loss": 1.005, "step": 6470 }, { "epoch": 0.32342657342657344, "grad_norm": 1.7370915412902832, "learning_rate": 6.904021611702941e-05, "loss": 1.0469, "step": 6475 }, { "epoch": 0.32367632367632365, "grad_norm": 2.1080522537231445, "learning_rate": 6.90147306182782e-05, "loss": 1.1069, "step": 6480 }, { "epoch": 0.3239260739260739, "grad_norm": 2.378490686416626, "learning_rate": 6.898924511952698e-05, "loss": 0.98, "step": 6485 }, { "epoch": 0.3241758241758242, "grad_norm": 1.8942471742630005, "learning_rate": 6.896375962077578e-05, "loss": 1.01, "step": 6490 }, { "epoch": 0.3244255744255744, "grad_norm": 2.146806478500366, "learning_rate": 6.893827412202457e-05, "loss": 1.1569, "step": 6495 }, { "epoch": 0.3246753246753247, "grad_norm": 2.2771503925323486, "learning_rate": 6.891278862327337e-05, "loss": 0.9925, "step": 6500 }, { "epoch": 0.32492507492507494, "grad_norm": 1.8457711935043335, "learning_rate": 6.888730312452215e-05, "loss": 1.0846, "step": 6505 }, { "epoch": 0.32517482517482516, "grad_norm": 1.7805609703063965, "learning_rate": 6.886181762577094e-05, "loss": 1.0914, "step": 6510 }, { "epoch": 0.3254245754245754, "grad_norm": 2.8742549419403076, "learning_rate": 6.883633212701974e-05, "loss": 1.0426, "step": 6515 }, { "epoch": 0.3256743256743257, "grad_norm": 1.9245723485946655, "learning_rate": 6.881084662826852e-05, "loss": 1.1173, "step": 6520 }, { "epoch": 0.3259240759240759, "grad_norm": 2.211257219314575, "learning_rate": 6.878536112951731e-05, "loss": 1.0387, "step": 6525 }, { "epoch": 0.3261738261738262, "grad_norm": 1.8026940822601318, "learning_rate": 6.875987563076611e-05, "loss": 1.0191, "step": 6530 }, { "epoch": 0.32642357642357644, "grad_norm": 2.5407819747924805, "learning_rate": 6.873439013201489e-05, "loss": 0.996, "step": 6535 }, { "epoch": 0.32667332667332666, "grad_norm": 2.2497782707214355, "learning_rate": 6.870890463326368e-05, "loss": 1.0354, "step": 6540 }, { "epoch": 0.3269230769230769, "grad_norm": 1.9373350143432617, "learning_rate": 6.868341913451246e-05, "loss": 1.1115, "step": 6545 }, { "epoch": 0.3271728271728272, "grad_norm": 1.8085883855819702, "learning_rate": 6.865793363576125e-05, "loss": 1.1005, "step": 6550 }, { "epoch": 0.3274225774225774, "grad_norm": 1.4361785650253296, "learning_rate": 6.863244813701005e-05, "loss": 1.0313, "step": 6555 }, { "epoch": 0.3276723276723277, "grad_norm": 2.1871395111083984, "learning_rate": 6.860696263825883e-05, "loss": 1.0101, "step": 6560 }, { "epoch": 0.32792207792207795, "grad_norm": 3.1070055961608887, "learning_rate": 6.858147713950762e-05, "loss": 0.8832, "step": 6565 }, { "epoch": 0.32817182817182816, "grad_norm": 2.3439877033233643, "learning_rate": 6.855599164075642e-05, "loss": 1.0518, "step": 6570 }, { "epoch": 0.32842157842157843, "grad_norm": 1.7471606731414795, "learning_rate": 6.85305061420052e-05, "loss": 1.194, "step": 6575 }, { "epoch": 0.32867132867132864, "grad_norm": 2.164783239364624, "learning_rate": 6.850502064325399e-05, "loss": 1.0434, "step": 6580 }, { "epoch": 0.3289210789210789, "grad_norm": 1.5020499229431152, "learning_rate": 6.847953514450279e-05, "loss": 1.0345, "step": 6585 }, { "epoch": 0.3291708291708292, "grad_norm": 2.447230577468872, "learning_rate": 6.845404964575157e-05, "loss": 1.0586, "step": 6590 }, { "epoch": 0.3294205794205794, "grad_norm": 3.978895902633667, "learning_rate": 6.842856414700036e-05, "loss": 1.0381, "step": 6595 }, { "epoch": 0.32967032967032966, "grad_norm": 2.322222948074341, "learning_rate": 6.840307864824916e-05, "loss": 1.0276, "step": 6600 }, { "epoch": 0.32992007992007993, "grad_norm": 1.8306394815444946, "learning_rate": 6.837759314949794e-05, "loss": 0.9978, "step": 6605 }, { "epoch": 0.33016983016983015, "grad_norm": 1.8843820095062256, "learning_rate": 6.835210765074673e-05, "loss": 1.0254, "step": 6610 }, { "epoch": 0.3304195804195804, "grad_norm": 1.6963485479354858, "learning_rate": 6.832662215199553e-05, "loss": 1.1329, "step": 6615 }, { "epoch": 0.3306693306693307, "grad_norm": 1.8296513557434082, "learning_rate": 6.83011366532443e-05, "loss": 1.1591, "step": 6620 }, { "epoch": 0.3309190809190809, "grad_norm": 2.5539371967315674, "learning_rate": 6.82756511544931e-05, "loss": 1.0363, "step": 6625 }, { "epoch": 0.33116883116883117, "grad_norm": 1.758256196975708, "learning_rate": 6.825016565574188e-05, "loss": 0.9915, "step": 6630 }, { "epoch": 0.33141858141858144, "grad_norm": 1.7679901123046875, "learning_rate": 6.822468015699067e-05, "loss": 1.0399, "step": 6635 }, { "epoch": 0.33166833166833165, "grad_norm": 2.0953726768493652, "learning_rate": 6.819919465823947e-05, "loss": 1.0078, "step": 6640 }, { "epoch": 0.3319180819180819, "grad_norm": 2.480586290359497, "learning_rate": 6.817370915948825e-05, "loss": 1.0225, "step": 6645 }, { "epoch": 0.3321678321678322, "grad_norm": 1.4508764743804932, "learning_rate": 6.814822366073704e-05, "loss": 1.0455, "step": 6650 }, { "epoch": 0.3324175824175824, "grad_norm": 1.4755468368530273, "learning_rate": 6.812273816198584e-05, "loss": 1.059, "step": 6655 }, { "epoch": 0.33266733266733267, "grad_norm": 1.8383623361587524, "learning_rate": 6.809725266323462e-05, "loss": 1.0837, "step": 6660 }, { "epoch": 0.33291708291708294, "grad_norm": 1.7846354246139526, "learning_rate": 6.807176716448341e-05, "loss": 1.0636, "step": 6665 }, { "epoch": 0.33316683316683315, "grad_norm": 2.093674421310425, "learning_rate": 6.80462816657322e-05, "loss": 1.0765, "step": 6670 }, { "epoch": 0.3334165834165834, "grad_norm": 1.7678923606872559, "learning_rate": 6.802079616698099e-05, "loss": 0.9824, "step": 6675 }, { "epoch": 0.3336663336663337, "grad_norm": 1.5662291049957275, "learning_rate": 6.799531066822978e-05, "loss": 0.9042, "step": 6680 }, { "epoch": 0.3339160839160839, "grad_norm": 1.687313437461853, "learning_rate": 6.796982516947858e-05, "loss": 0.9901, "step": 6685 }, { "epoch": 0.3341658341658342, "grad_norm": 2.0302696228027344, "learning_rate": 6.794433967072736e-05, "loss": 0.9086, "step": 6690 }, { "epoch": 0.3344155844155844, "grad_norm": 2.009594440460205, "learning_rate": 6.791885417197615e-05, "loss": 1.0366, "step": 6695 }, { "epoch": 0.33466533466533466, "grad_norm": 1.8643195629119873, "learning_rate": 6.789336867322493e-05, "loss": 1.0822, "step": 6700 }, { "epoch": 0.3349150849150849, "grad_norm": 1.8301814794540405, "learning_rate": 6.786788317447372e-05, "loss": 1.0889, "step": 6705 }, { "epoch": 0.33516483516483514, "grad_norm": 1.9102551937103271, "learning_rate": 6.784239767572252e-05, "loss": 0.9281, "step": 6710 }, { "epoch": 0.3354145854145854, "grad_norm": 1.8911617994308472, "learning_rate": 6.78169121769713e-05, "loss": 0.8924, "step": 6715 }, { "epoch": 0.3356643356643357, "grad_norm": 1.7902599573135376, "learning_rate": 6.779142667822009e-05, "loss": 0.9144, "step": 6720 }, { "epoch": 0.3359140859140859, "grad_norm": 1.7835780382156372, "learning_rate": 6.776594117946889e-05, "loss": 1.0449, "step": 6725 }, { "epoch": 0.33616383616383616, "grad_norm": 2.1837122440338135, "learning_rate": 6.774045568071767e-05, "loss": 1.0208, "step": 6730 }, { "epoch": 0.33641358641358643, "grad_norm": 1.9016153812408447, "learning_rate": 6.771497018196646e-05, "loss": 0.953, "step": 6735 }, { "epoch": 0.33666333666333664, "grad_norm": 1.817519187927246, "learning_rate": 6.768948468321526e-05, "loss": 1.0966, "step": 6740 }, { "epoch": 0.3369130869130869, "grad_norm": 2.220903158187866, "learning_rate": 6.766399918446404e-05, "loss": 1.0016, "step": 6745 }, { "epoch": 0.3371628371628372, "grad_norm": 1.7414711713790894, "learning_rate": 6.763851368571284e-05, "loss": 0.9801, "step": 6750 }, { "epoch": 0.3374125874125874, "grad_norm": 1.820414423942566, "learning_rate": 6.761302818696163e-05, "loss": 1.0932, "step": 6755 }, { "epoch": 0.33766233766233766, "grad_norm": 1.5958203077316284, "learning_rate": 6.758754268821041e-05, "loss": 1.0537, "step": 6760 }, { "epoch": 0.33791208791208793, "grad_norm": 1.9953911304473877, "learning_rate": 6.756205718945921e-05, "loss": 1.0815, "step": 6765 }, { "epoch": 0.33816183816183815, "grad_norm": 2.0490479469299316, "learning_rate": 6.7536571690708e-05, "loss": 1.054, "step": 6770 }, { "epoch": 0.3384115884115884, "grad_norm": 2.0254197120666504, "learning_rate": 6.751108619195678e-05, "loss": 1.0589, "step": 6775 }, { "epoch": 0.3386613386613387, "grad_norm": 1.7392507791519165, "learning_rate": 6.748560069320557e-05, "loss": 1.0483, "step": 6780 }, { "epoch": 0.3389110889110889, "grad_norm": 1.5730153322219849, "learning_rate": 6.746011519445435e-05, "loss": 1.0215, "step": 6785 }, { "epoch": 0.33916083916083917, "grad_norm": 2.1709887981414795, "learning_rate": 6.743462969570314e-05, "loss": 1.0654, "step": 6790 }, { "epoch": 0.33941058941058944, "grad_norm": 1.6566568613052368, "learning_rate": 6.740914419695194e-05, "loss": 1.1017, "step": 6795 }, { "epoch": 0.33966033966033965, "grad_norm": 2.193427562713623, "learning_rate": 6.738365869820072e-05, "loss": 0.9823, "step": 6800 }, { "epoch": 0.3399100899100899, "grad_norm": 1.9699043035507202, "learning_rate": 6.735817319944952e-05, "loss": 1.0338, "step": 6805 }, { "epoch": 0.34015984015984013, "grad_norm": 1.8515230417251587, "learning_rate": 6.73326877006983e-05, "loss": 1.0833, "step": 6810 }, { "epoch": 0.3404095904095904, "grad_norm": 1.6545101404190063, "learning_rate": 6.730720220194709e-05, "loss": 1.0573, "step": 6815 }, { "epoch": 0.34065934065934067, "grad_norm": 2.5793654918670654, "learning_rate": 6.728171670319589e-05, "loss": 1.0264, "step": 6820 }, { "epoch": 0.3409090909090909, "grad_norm": 1.9767229557037354, "learning_rate": 6.725623120444468e-05, "loss": 0.9903, "step": 6825 }, { "epoch": 0.34115884115884115, "grad_norm": 1.7715178728103638, "learning_rate": 6.723074570569346e-05, "loss": 1.1434, "step": 6830 }, { "epoch": 0.3414085914085914, "grad_norm": 1.484311580657959, "learning_rate": 6.720526020694226e-05, "loss": 1.0258, "step": 6835 }, { "epoch": 0.34165834165834164, "grad_norm": 1.669066071510315, "learning_rate": 6.717977470819105e-05, "loss": 1.0215, "step": 6840 }, { "epoch": 0.3419080919080919, "grad_norm": 1.863688588142395, "learning_rate": 6.715428920943983e-05, "loss": 1.0705, "step": 6845 }, { "epoch": 0.3421578421578422, "grad_norm": 1.884108304977417, "learning_rate": 6.712880371068863e-05, "loss": 1.0548, "step": 6850 }, { "epoch": 0.3424075924075924, "grad_norm": 2.5251309871673584, "learning_rate": 6.710331821193742e-05, "loss": 0.9499, "step": 6855 }, { "epoch": 0.34265734265734266, "grad_norm": 1.6779309511184692, "learning_rate": 6.70778327131862e-05, "loss": 0.9498, "step": 6860 }, { "epoch": 0.3429070929070929, "grad_norm": 2.285132884979248, "learning_rate": 6.705234721443499e-05, "loss": 1.0461, "step": 6865 }, { "epoch": 0.34315684315684314, "grad_norm": 2.100578784942627, "learning_rate": 6.702686171568377e-05, "loss": 0.9654, "step": 6870 }, { "epoch": 0.3434065934065934, "grad_norm": 1.7280919551849365, "learning_rate": 6.700137621693257e-05, "loss": 1.0376, "step": 6875 }, { "epoch": 0.3436563436563437, "grad_norm": 1.8477983474731445, "learning_rate": 6.697589071818136e-05, "loss": 0.9869, "step": 6880 }, { "epoch": 0.3439060939060939, "grad_norm": 1.6880420446395874, "learning_rate": 6.695040521943014e-05, "loss": 1.0229, "step": 6885 }, { "epoch": 0.34415584415584416, "grad_norm": 1.8154518604278564, "learning_rate": 6.692491972067894e-05, "loss": 1.1337, "step": 6890 }, { "epoch": 0.34440559440559443, "grad_norm": 1.6728017330169678, "learning_rate": 6.689943422192773e-05, "loss": 0.996, "step": 6895 }, { "epoch": 0.34465534465534464, "grad_norm": 2.2039999961853027, "learning_rate": 6.687394872317651e-05, "loss": 0.9854, "step": 6900 }, { "epoch": 0.3449050949050949, "grad_norm": 2.0283381938934326, "learning_rate": 6.684846322442531e-05, "loss": 0.9173, "step": 6905 }, { "epoch": 0.3451548451548452, "grad_norm": 2.2166800498962402, "learning_rate": 6.68229777256741e-05, "loss": 1.0394, "step": 6910 }, { "epoch": 0.3454045954045954, "grad_norm": 2.1432483196258545, "learning_rate": 6.679749222692288e-05, "loss": 0.9765, "step": 6915 }, { "epoch": 0.34565434565434566, "grad_norm": 1.740493893623352, "learning_rate": 6.677200672817168e-05, "loss": 0.9686, "step": 6920 }, { "epoch": 0.34590409590409593, "grad_norm": 1.867631196975708, "learning_rate": 6.674652122942047e-05, "loss": 0.9966, "step": 6925 }, { "epoch": 0.34615384615384615, "grad_norm": 2.9544172286987305, "learning_rate": 6.672103573066925e-05, "loss": 1.0435, "step": 6930 }, { "epoch": 0.3464035964035964, "grad_norm": 1.6327377557754517, "learning_rate": 6.669555023191805e-05, "loss": 0.9429, "step": 6935 }, { "epoch": 0.34665334665334663, "grad_norm": 2.363645315170288, "learning_rate": 6.667006473316682e-05, "loss": 1.0809, "step": 6940 }, { "epoch": 0.3469030969030969, "grad_norm": 3.817553997039795, "learning_rate": 6.664457923441562e-05, "loss": 1.1031, "step": 6945 }, { "epoch": 0.34715284715284717, "grad_norm": 2.0058345794677734, "learning_rate": 6.66190937356644e-05, "loss": 1.073, "step": 6950 }, { "epoch": 0.3474025974025974, "grad_norm": 1.6295661926269531, "learning_rate": 6.659360823691319e-05, "loss": 1.007, "step": 6955 }, { "epoch": 0.34765234765234765, "grad_norm": 1.8767647743225098, "learning_rate": 6.656812273816199e-05, "loss": 0.9889, "step": 6960 }, { "epoch": 0.3479020979020979, "grad_norm": 2.1724014282226562, "learning_rate": 6.654263723941078e-05, "loss": 1.1354, "step": 6965 }, { "epoch": 0.34815184815184813, "grad_norm": 2.0424623489379883, "learning_rate": 6.651715174065956e-05, "loss": 1.0469, "step": 6970 }, { "epoch": 0.3484015984015984, "grad_norm": 1.7980090379714966, "learning_rate": 6.649166624190836e-05, "loss": 1.0419, "step": 6975 }, { "epoch": 0.34865134865134867, "grad_norm": 1.8220294713974, "learning_rate": 6.646618074315715e-05, "loss": 0.988, "step": 6980 }, { "epoch": 0.3489010989010989, "grad_norm": 1.8621596097946167, "learning_rate": 6.644069524440593e-05, "loss": 1.033, "step": 6985 }, { "epoch": 0.34915084915084915, "grad_norm": 1.9534958600997925, "learning_rate": 6.641520974565473e-05, "loss": 1.0584, "step": 6990 }, { "epoch": 0.3494005994005994, "grad_norm": 1.489283800125122, "learning_rate": 6.638972424690352e-05, "loss": 0.9959, "step": 6995 }, { "epoch": 0.34965034965034963, "grad_norm": 2.492037296295166, "learning_rate": 6.63642387481523e-05, "loss": 1.0362, "step": 7000 }, { "epoch": 0.3499000999000999, "grad_norm": 2.011425018310547, "learning_rate": 6.63387532494011e-05, "loss": 1.0939, "step": 7005 }, { "epoch": 0.3501498501498502, "grad_norm": 2.5336670875549316, "learning_rate": 6.631326775064989e-05, "loss": 1.0549, "step": 7010 }, { "epoch": 0.3503996003996004, "grad_norm": 1.7394782304763794, "learning_rate": 6.628778225189868e-05, "loss": 1.0872, "step": 7015 }, { "epoch": 0.35064935064935066, "grad_norm": 1.8983365297317505, "learning_rate": 6.626229675314746e-05, "loss": 1.0867, "step": 7020 }, { "epoch": 0.3508991008991009, "grad_norm": 1.7626093626022339, "learning_rate": 6.623681125439624e-05, "loss": 1.1223, "step": 7025 }, { "epoch": 0.35114885114885114, "grad_norm": 1.776146411895752, "learning_rate": 6.621132575564504e-05, "loss": 0.9976, "step": 7030 }, { "epoch": 0.3513986013986014, "grad_norm": 1.8615010976791382, "learning_rate": 6.618584025689383e-05, "loss": 1.0287, "step": 7035 }, { "epoch": 0.3516483516483517, "grad_norm": 1.4609383344650269, "learning_rate": 6.616035475814261e-05, "loss": 0.9588, "step": 7040 }, { "epoch": 0.3518981018981019, "grad_norm": 1.915842056274414, "learning_rate": 6.613486925939141e-05, "loss": 1.0668, "step": 7045 }, { "epoch": 0.35214785214785216, "grad_norm": 2.43540620803833, "learning_rate": 6.61093837606402e-05, "loss": 1.0413, "step": 7050 }, { "epoch": 0.35239760239760237, "grad_norm": 1.778512954711914, "learning_rate": 6.6083898261889e-05, "loss": 1.0816, "step": 7055 }, { "epoch": 0.35264735264735264, "grad_norm": 2.1932787895202637, "learning_rate": 6.605841276313778e-05, "loss": 1.1106, "step": 7060 }, { "epoch": 0.3528971028971029, "grad_norm": 1.9881888628005981, "learning_rate": 6.603292726438657e-05, "loss": 1.02, "step": 7065 }, { "epoch": 0.3531468531468531, "grad_norm": 1.966972827911377, "learning_rate": 6.600744176563536e-05, "loss": 0.9705, "step": 7070 }, { "epoch": 0.3533966033966034, "grad_norm": 2.1445083618164062, "learning_rate": 6.598195626688415e-05, "loss": 0.922, "step": 7075 }, { "epoch": 0.35364635364635366, "grad_norm": 1.6460120677947998, "learning_rate": 6.595647076813294e-05, "loss": 1.1013, "step": 7080 }, { "epoch": 0.3538961038961039, "grad_norm": 2.2874999046325684, "learning_rate": 6.593098526938173e-05, "loss": 0.9498, "step": 7085 }, { "epoch": 0.35414585414585414, "grad_norm": 1.5944546461105347, "learning_rate": 6.590549977063052e-05, "loss": 1.0214, "step": 7090 }, { "epoch": 0.3543956043956044, "grad_norm": 1.5479800701141357, "learning_rate": 6.58800142718793e-05, "loss": 0.9717, "step": 7095 }, { "epoch": 0.3546453546453546, "grad_norm": 1.4920616149902344, "learning_rate": 6.585452877312809e-05, "loss": 1.0323, "step": 7100 }, { "epoch": 0.3548951048951049, "grad_norm": 1.9111156463623047, "learning_rate": 6.582904327437688e-05, "loss": 1.0772, "step": 7105 }, { "epoch": 0.35514485514485516, "grad_norm": 1.8040502071380615, "learning_rate": 6.580355777562567e-05, "loss": 1.1045, "step": 7110 }, { "epoch": 0.3553946053946054, "grad_norm": 2.191572666168213, "learning_rate": 6.577807227687446e-05, "loss": 1.0472, "step": 7115 }, { "epoch": 0.35564435564435565, "grad_norm": 2.430616617202759, "learning_rate": 6.575258677812325e-05, "loss": 1.0189, "step": 7120 }, { "epoch": 0.3558941058941059, "grad_norm": 1.3919936418533325, "learning_rate": 6.572710127937204e-05, "loss": 0.982, "step": 7125 }, { "epoch": 0.35614385614385613, "grad_norm": 1.8597478866577148, "learning_rate": 6.570161578062083e-05, "loss": 1.0026, "step": 7130 }, { "epoch": 0.3563936063936064, "grad_norm": 1.771545648574829, "learning_rate": 6.567613028186962e-05, "loss": 0.9777, "step": 7135 }, { "epoch": 0.35664335664335667, "grad_norm": 2.4949798583984375, "learning_rate": 6.565064478311841e-05, "loss": 1.0609, "step": 7140 }, { "epoch": 0.3568931068931069, "grad_norm": 2.1426548957824707, "learning_rate": 6.56251592843672e-05, "loss": 0.9224, "step": 7145 }, { "epoch": 0.35714285714285715, "grad_norm": 1.516606092453003, "learning_rate": 6.559967378561599e-05, "loss": 1.0719, "step": 7150 }, { "epoch": 0.3573926073926074, "grad_norm": 2.19494366645813, "learning_rate": 6.557418828686478e-05, "loss": 1.0002, "step": 7155 }, { "epoch": 0.35764235764235763, "grad_norm": 1.6472409963607788, "learning_rate": 6.554870278811357e-05, "loss": 0.9468, "step": 7160 }, { "epoch": 0.3578921078921079, "grad_norm": 2.0603489875793457, "learning_rate": 6.552321728936235e-05, "loss": 1.1041, "step": 7165 }, { "epoch": 0.3581418581418581, "grad_norm": 3.1564693450927734, "learning_rate": 6.549773179061115e-05, "loss": 0.9655, "step": 7170 }, { "epoch": 0.3583916083916084, "grad_norm": 2.0714056491851807, "learning_rate": 6.547224629185994e-05, "loss": 0.9795, "step": 7175 }, { "epoch": 0.35864135864135865, "grad_norm": 2.8105690479278564, "learning_rate": 6.544676079310872e-05, "loss": 1.0101, "step": 7180 }, { "epoch": 0.35889110889110887, "grad_norm": 1.8362126350402832, "learning_rate": 6.542127529435751e-05, "loss": 1.0437, "step": 7185 }, { "epoch": 0.35914085914085914, "grad_norm": 1.8384443521499634, "learning_rate": 6.53957897956063e-05, "loss": 0.9997, "step": 7190 }, { "epoch": 0.3593906093906094, "grad_norm": 1.921250581741333, "learning_rate": 6.53703042968551e-05, "loss": 1.1129, "step": 7195 }, { "epoch": 0.3596403596403596, "grad_norm": 1.7445088624954224, "learning_rate": 6.534481879810388e-05, "loss": 0.9657, "step": 7200 }, { "epoch": 0.3598901098901099, "grad_norm": 2.0287718772888184, "learning_rate": 6.531933329935267e-05, "loss": 1.0302, "step": 7205 }, { "epoch": 0.36013986013986016, "grad_norm": 2.0540621280670166, "learning_rate": 6.529384780060146e-05, "loss": 0.9622, "step": 7210 }, { "epoch": 0.36038961038961037, "grad_norm": 2.121351957321167, "learning_rate": 6.526836230185025e-05, "loss": 1.0072, "step": 7215 }, { "epoch": 0.36063936063936064, "grad_norm": 1.7123279571533203, "learning_rate": 6.524287680309903e-05, "loss": 0.9287, "step": 7220 }, { "epoch": 0.3608891108891109, "grad_norm": 1.9038200378417969, "learning_rate": 6.521739130434783e-05, "loss": 1.0385, "step": 7225 }, { "epoch": 0.3611388611388611, "grad_norm": 1.7427841424942017, "learning_rate": 6.519190580559662e-05, "loss": 1.1349, "step": 7230 }, { "epoch": 0.3613886113886114, "grad_norm": 1.5821549892425537, "learning_rate": 6.51664203068454e-05, "loss": 1.0014, "step": 7235 }, { "epoch": 0.36163836163836166, "grad_norm": 1.7430912256240845, "learning_rate": 6.51409348080942e-05, "loss": 1.0734, "step": 7240 }, { "epoch": 0.3618881118881119, "grad_norm": 1.906278133392334, "learning_rate": 6.511544930934299e-05, "loss": 0.976, "step": 7245 }, { "epoch": 0.36213786213786214, "grad_norm": 1.6206927299499512, "learning_rate": 6.508996381059177e-05, "loss": 0.9911, "step": 7250 }, { "epoch": 0.3623876123876124, "grad_norm": 2.6637444496154785, "learning_rate": 6.506447831184057e-05, "loss": 1.0924, "step": 7255 }, { "epoch": 0.3626373626373626, "grad_norm": 2.3829855918884277, "learning_rate": 6.503899281308935e-05, "loss": 0.9521, "step": 7260 }, { "epoch": 0.3628871128871129, "grad_norm": 1.8421683311462402, "learning_rate": 6.501350731433814e-05, "loss": 0.9618, "step": 7265 }, { "epoch": 0.36313686313686316, "grad_norm": 2.104562520980835, "learning_rate": 6.498802181558693e-05, "loss": 1.068, "step": 7270 }, { "epoch": 0.3633866133866134, "grad_norm": 2.01918363571167, "learning_rate": 6.496253631683572e-05, "loss": 1.0946, "step": 7275 }, { "epoch": 0.36363636363636365, "grad_norm": 1.788277268409729, "learning_rate": 6.493705081808451e-05, "loss": 0.9609, "step": 7280 }, { "epoch": 0.36388611388611386, "grad_norm": 2.0937082767486572, "learning_rate": 6.49115653193333e-05, "loss": 1.0192, "step": 7285 }, { "epoch": 0.36413586413586413, "grad_norm": 1.8669873476028442, "learning_rate": 6.488607982058208e-05, "loss": 1.0826, "step": 7290 }, { "epoch": 0.3643856143856144, "grad_norm": 1.4630852937698364, "learning_rate": 6.486059432183088e-05, "loss": 0.9921, "step": 7295 }, { "epoch": 0.3646353646353646, "grad_norm": 1.7921289205551147, "learning_rate": 6.483510882307967e-05, "loss": 0.9769, "step": 7300 }, { "epoch": 0.3648851148851149, "grad_norm": 1.750465989112854, "learning_rate": 6.480962332432845e-05, "loss": 1.0372, "step": 7305 }, { "epoch": 0.36513486513486515, "grad_norm": 2.1092684268951416, "learning_rate": 6.478413782557725e-05, "loss": 1.162, "step": 7310 }, { "epoch": 0.36538461538461536, "grad_norm": 1.9966676235198975, "learning_rate": 6.475865232682604e-05, "loss": 1.0453, "step": 7315 }, { "epoch": 0.36563436563436563, "grad_norm": 1.9721391201019287, "learning_rate": 6.473316682807484e-05, "loss": 1.0586, "step": 7320 }, { "epoch": 0.3658841158841159, "grad_norm": 3.088454008102417, "learning_rate": 6.470768132932362e-05, "loss": 0.9656, "step": 7325 }, { "epoch": 0.3661338661338661, "grad_norm": 1.8847544193267822, "learning_rate": 6.468219583057241e-05, "loss": 1.0032, "step": 7330 }, { "epoch": 0.3663836163836164, "grad_norm": 2.1002490520477295, "learning_rate": 6.465671033182121e-05, "loss": 0.9903, "step": 7335 }, { "epoch": 0.36663336663336665, "grad_norm": 1.9421480894088745, "learning_rate": 6.463122483306998e-05, "loss": 1.0212, "step": 7340 }, { "epoch": 0.36688311688311687, "grad_norm": 1.443678855895996, "learning_rate": 6.460573933431876e-05, "loss": 1.0677, "step": 7345 }, { "epoch": 0.36713286713286714, "grad_norm": 1.8396480083465576, "learning_rate": 6.458025383556756e-05, "loss": 1.0301, "step": 7350 }, { "epoch": 0.3673826173826174, "grad_norm": 2.2769358158111572, "learning_rate": 6.455476833681635e-05, "loss": 1.0505, "step": 7355 }, { "epoch": 0.3676323676323676, "grad_norm": 1.856937050819397, "learning_rate": 6.452928283806515e-05, "loss": 0.9785, "step": 7360 }, { "epoch": 0.3678821178821179, "grad_norm": 2.336183547973633, "learning_rate": 6.450379733931393e-05, "loss": 1.0499, "step": 7365 }, { "epoch": 0.36813186813186816, "grad_norm": 1.690975308418274, "learning_rate": 6.447831184056272e-05, "loss": 1.2334, "step": 7370 }, { "epoch": 0.36838161838161837, "grad_norm": 1.9701261520385742, "learning_rate": 6.445282634181152e-05, "loss": 0.9827, "step": 7375 }, { "epoch": 0.36863136863136864, "grad_norm": 2.4056832790374756, "learning_rate": 6.44273408430603e-05, "loss": 1.0255, "step": 7380 }, { "epoch": 0.3688811188811189, "grad_norm": 2.1437337398529053, "learning_rate": 6.440185534430909e-05, "loss": 1.0682, "step": 7385 }, { "epoch": 0.3691308691308691, "grad_norm": 1.690464735031128, "learning_rate": 6.437636984555789e-05, "loss": 1.015, "step": 7390 }, { "epoch": 0.3693806193806194, "grad_norm": 2.64349627494812, "learning_rate": 6.435088434680667e-05, "loss": 1.0083, "step": 7395 }, { "epoch": 0.3696303696303696, "grad_norm": 1.578401803970337, "learning_rate": 6.432539884805546e-05, "loss": 1.1007, "step": 7400 }, { "epoch": 0.3698801198801199, "grad_norm": 1.4217660427093506, "learning_rate": 6.429991334930426e-05, "loss": 0.991, "step": 7405 }, { "epoch": 0.37012987012987014, "grad_norm": 2.282193660736084, "learning_rate": 6.427442785055304e-05, "loss": 1.0136, "step": 7410 }, { "epoch": 0.37037962037962036, "grad_norm": 1.8046737909317017, "learning_rate": 6.424894235180183e-05, "loss": 1.001, "step": 7415 }, { "epoch": 0.3706293706293706, "grad_norm": 1.7892814874649048, "learning_rate": 6.422345685305061e-05, "loss": 1.0044, "step": 7420 }, { "epoch": 0.3708791208791209, "grad_norm": 1.7249891757965088, "learning_rate": 6.41979713542994e-05, "loss": 1.0505, "step": 7425 }, { "epoch": 0.3711288711288711, "grad_norm": 1.6267375946044922, "learning_rate": 6.41724858555482e-05, "loss": 1.0564, "step": 7430 }, { "epoch": 0.3713786213786214, "grad_norm": 1.2479029893875122, "learning_rate": 6.414700035679698e-05, "loss": 1.0481, "step": 7435 }, { "epoch": 0.37162837162837165, "grad_norm": 2.2487740516662598, "learning_rate": 6.412151485804577e-05, "loss": 1.0094, "step": 7440 }, { "epoch": 0.37187812187812186, "grad_norm": 3.271279811859131, "learning_rate": 6.409602935929457e-05, "loss": 1.0937, "step": 7445 }, { "epoch": 0.37212787212787213, "grad_norm": 1.6846765279769897, "learning_rate": 6.407054386054335e-05, "loss": 1.0789, "step": 7450 }, { "epoch": 0.3723776223776224, "grad_norm": 1.5623890161514282, "learning_rate": 6.404505836179214e-05, "loss": 1.0531, "step": 7455 }, { "epoch": 0.3726273726273726, "grad_norm": 1.694534420967102, "learning_rate": 6.401957286304094e-05, "loss": 1.1043, "step": 7460 }, { "epoch": 0.3728771228771229, "grad_norm": 1.5684089660644531, "learning_rate": 6.399408736428972e-05, "loss": 0.9736, "step": 7465 }, { "epoch": 0.37312687312687315, "grad_norm": 1.5486855506896973, "learning_rate": 6.396860186553851e-05, "loss": 1.0439, "step": 7470 }, { "epoch": 0.37337662337662336, "grad_norm": 1.7873612642288208, "learning_rate": 6.394311636678731e-05, "loss": 1.1133, "step": 7475 }, { "epoch": 0.37362637362637363, "grad_norm": 1.7592064142227173, "learning_rate": 6.391763086803609e-05, "loss": 1.0163, "step": 7480 }, { "epoch": 0.3738761238761239, "grad_norm": 1.801048994064331, "learning_rate": 6.389214536928488e-05, "loss": 1.0608, "step": 7485 }, { "epoch": 0.3741258741258741, "grad_norm": 1.920106053352356, "learning_rate": 6.386665987053368e-05, "loss": 1.1211, "step": 7490 }, { "epoch": 0.3743756243756244, "grad_norm": 2.219479560852051, "learning_rate": 6.384117437178246e-05, "loss": 1.0455, "step": 7495 }, { "epoch": 0.37462537462537465, "grad_norm": 2.1176064014434814, "learning_rate": 6.381568887303125e-05, "loss": 1.1607, "step": 7500 }, { "epoch": 0.37487512487512487, "grad_norm": 2.15228533744812, "learning_rate": 6.379020337428003e-05, "loss": 1.056, "step": 7505 }, { "epoch": 0.37512487512487513, "grad_norm": 1.8641549348831177, "learning_rate": 6.376471787552882e-05, "loss": 1.0816, "step": 7510 }, { "epoch": 0.37537462537462535, "grad_norm": 1.5511192083358765, "learning_rate": 6.373923237677762e-05, "loss": 1.0896, "step": 7515 }, { "epoch": 0.3756243756243756, "grad_norm": 2.1165225505828857, "learning_rate": 6.37137468780264e-05, "loss": 1.0925, "step": 7520 }, { "epoch": 0.3758741258741259, "grad_norm": 1.6934535503387451, "learning_rate": 6.368826137927519e-05, "loss": 0.8896, "step": 7525 }, { "epoch": 0.3761238761238761, "grad_norm": 1.7847000360488892, "learning_rate": 6.366277588052399e-05, "loss": 1.0985, "step": 7530 }, { "epoch": 0.37637362637362637, "grad_norm": 1.9369451999664307, "learning_rate": 6.363729038177277e-05, "loss": 1.0078, "step": 7535 }, { "epoch": 0.37662337662337664, "grad_norm": 1.6640955209732056, "learning_rate": 6.361180488302156e-05, "loss": 1.0518, "step": 7540 }, { "epoch": 0.37687312687312685, "grad_norm": 2.27824330329895, "learning_rate": 6.358631938427036e-05, "loss": 1.0455, "step": 7545 }, { "epoch": 0.3771228771228771, "grad_norm": 3.833918333053589, "learning_rate": 6.356083388551914e-05, "loss": 1.0763, "step": 7550 }, { "epoch": 0.3773726273726274, "grad_norm": 1.7248731851577759, "learning_rate": 6.353534838676793e-05, "loss": 1.046, "step": 7555 }, { "epoch": 0.3776223776223776, "grad_norm": 2.270656108856201, "learning_rate": 6.350986288801673e-05, "loss": 1.1374, "step": 7560 }, { "epoch": 0.37787212787212787, "grad_norm": 1.6801711320877075, "learning_rate": 6.348437738926551e-05, "loss": 1.051, "step": 7565 }, { "epoch": 0.37812187812187814, "grad_norm": 2.2533745765686035, "learning_rate": 6.345889189051431e-05, "loss": 1.0927, "step": 7570 }, { "epoch": 0.37837162837162835, "grad_norm": 1.6258869171142578, "learning_rate": 6.34334063917631e-05, "loss": 1.1323, "step": 7575 }, { "epoch": 0.3786213786213786, "grad_norm": 2.0186474323272705, "learning_rate": 6.340792089301187e-05, "loss": 1.0799, "step": 7580 }, { "epoch": 0.3788711288711289, "grad_norm": 2.309969663619995, "learning_rate": 6.338243539426067e-05, "loss": 1.0725, "step": 7585 }, { "epoch": 0.3791208791208791, "grad_norm": 1.916247844696045, "learning_rate": 6.335694989550945e-05, "loss": 0.9798, "step": 7590 }, { "epoch": 0.3793706293706294, "grad_norm": 1.6921613216400146, "learning_rate": 6.333146439675824e-05, "loss": 1.0227, "step": 7595 }, { "epoch": 0.37962037962037964, "grad_norm": 1.8067572116851807, "learning_rate": 6.330597889800704e-05, "loss": 0.9897, "step": 7600 }, { "epoch": 0.37987012987012986, "grad_norm": 1.7754874229431152, "learning_rate": 6.328049339925582e-05, "loss": 0.9512, "step": 7605 }, { "epoch": 0.3801198801198801, "grad_norm": 1.8163813352584839, "learning_rate": 6.325500790050461e-05, "loss": 1.0567, "step": 7610 }, { "epoch": 0.3803696303696304, "grad_norm": 2.15447735786438, "learning_rate": 6.322952240175341e-05, "loss": 1.0911, "step": 7615 }, { "epoch": 0.3806193806193806, "grad_norm": 2.8628618717193604, "learning_rate": 6.320403690300219e-05, "loss": 1.0298, "step": 7620 }, { "epoch": 0.3808691308691309, "grad_norm": 1.8116693496704102, "learning_rate": 6.317855140425099e-05, "loss": 1.0524, "step": 7625 }, { "epoch": 0.3811188811188811, "grad_norm": 2.1319994926452637, "learning_rate": 6.315306590549978e-05, "loss": 0.9624, "step": 7630 }, { "epoch": 0.38136863136863136, "grad_norm": 2.084681272506714, "learning_rate": 6.312758040674856e-05, "loss": 1.0823, "step": 7635 }, { "epoch": 0.38161838161838163, "grad_norm": 2.0021917819976807, "learning_rate": 6.310209490799736e-05, "loss": 1.0351, "step": 7640 }, { "epoch": 0.38186813186813184, "grad_norm": 1.5883678197860718, "learning_rate": 6.307660940924615e-05, "loss": 1.0615, "step": 7645 }, { "epoch": 0.3821178821178821, "grad_norm": 1.8104307651519775, "learning_rate": 6.305112391049493e-05, "loss": 1.0148, "step": 7650 }, { "epoch": 0.3823676323676324, "grad_norm": 2.011185646057129, "learning_rate": 6.302563841174373e-05, "loss": 1.0591, "step": 7655 }, { "epoch": 0.3826173826173826, "grad_norm": 2.12699031829834, "learning_rate": 6.30001529129925e-05, "loss": 1.0402, "step": 7660 }, { "epoch": 0.38286713286713286, "grad_norm": 1.6288317441940308, "learning_rate": 6.297466741424129e-05, "loss": 1.041, "step": 7665 }, { "epoch": 0.38311688311688313, "grad_norm": 2.1979570388793945, "learning_rate": 6.294918191549009e-05, "loss": 1.0802, "step": 7670 }, { "epoch": 0.38336663336663335, "grad_norm": 1.958606481552124, "learning_rate": 6.292369641673887e-05, "loss": 1.0622, "step": 7675 }, { "epoch": 0.3836163836163836, "grad_norm": 1.7825753688812256, "learning_rate": 6.289821091798767e-05, "loss": 1.0359, "step": 7680 }, { "epoch": 0.3838661338661339, "grad_norm": 1.7017152309417725, "learning_rate": 6.287272541923646e-05, "loss": 1.0338, "step": 7685 }, { "epoch": 0.3841158841158841, "grad_norm": 2.1158463954925537, "learning_rate": 6.284723992048524e-05, "loss": 1.0409, "step": 7690 }, { "epoch": 0.38436563436563437, "grad_norm": 1.7794309854507446, "learning_rate": 6.282175442173404e-05, "loss": 1.139, "step": 7695 }, { "epoch": 0.38461538461538464, "grad_norm": 1.6573983430862427, "learning_rate": 6.279626892298283e-05, "loss": 1.0766, "step": 7700 }, { "epoch": 0.38486513486513485, "grad_norm": 2.311091661453247, "learning_rate": 6.277078342423161e-05, "loss": 1.0999, "step": 7705 }, { "epoch": 0.3851148851148851, "grad_norm": 1.7011609077453613, "learning_rate": 6.274529792548041e-05, "loss": 0.9487, "step": 7710 }, { "epoch": 0.3853646353646354, "grad_norm": 1.9077997207641602, "learning_rate": 6.27198124267292e-05, "loss": 1.0345, "step": 7715 }, { "epoch": 0.3856143856143856, "grad_norm": 2.0967483520507812, "learning_rate": 6.269432692797798e-05, "loss": 0.9254, "step": 7720 }, { "epoch": 0.38586413586413587, "grad_norm": 1.6228994131088257, "learning_rate": 6.266884142922678e-05, "loss": 1.0503, "step": 7725 }, { "epoch": 0.38611388611388614, "grad_norm": 1.7925326824188232, "learning_rate": 6.264335593047557e-05, "loss": 1.0375, "step": 7730 }, { "epoch": 0.38636363636363635, "grad_norm": 1.6184121370315552, "learning_rate": 6.261787043172435e-05, "loss": 1.0827, "step": 7735 }, { "epoch": 0.3866133866133866, "grad_norm": 1.8481411933898926, "learning_rate": 6.259238493297314e-05, "loss": 0.9722, "step": 7740 }, { "epoch": 0.38686313686313684, "grad_norm": 2.7838218212127686, "learning_rate": 6.256689943422192e-05, "loss": 1.0387, "step": 7745 }, { "epoch": 0.3871128871128871, "grad_norm": 2.070068836212158, "learning_rate": 6.254141393547072e-05, "loss": 0.9594, "step": 7750 }, { "epoch": 0.3873626373626374, "grad_norm": 1.8007439374923706, "learning_rate": 6.251592843671951e-05, "loss": 1.0842, "step": 7755 }, { "epoch": 0.3876123876123876, "grad_norm": 1.466071605682373, "learning_rate": 6.249044293796829e-05, "loss": 0.9498, "step": 7760 }, { "epoch": 0.38786213786213786, "grad_norm": 1.533583402633667, "learning_rate": 6.246495743921709e-05, "loss": 1.0614, "step": 7765 }, { "epoch": 0.3881118881118881, "grad_norm": 1.8746877908706665, "learning_rate": 6.243947194046588e-05, "loss": 1.103, "step": 7770 }, { "epoch": 0.38836163836163834, "grad_norm": 1.4828280210494995, "learning_rate": 6.241398644171466e-05, "loss": 1.0447, "step": 7775 }, { "epoch": 0.3886113886113886, "grad_norm": 1.7378987073898315, "learning_rate": 6.238850094296346e-05, "loss": 1.0367, "step": 7780 }, { "epoch": 0.3888611388611389, "grad_norm": 1.5251336097717285, "learning_rate": 6.236301544421225e-05, "loss": 1.1013, "step": 7785 }, { "epoch": 0.3891108891108891, "grad_norm": 1.7578907012939453, "learning_rate": 6.233752994546103e-05, "loss": 1.1633, "step": 7790 }, { "epoch": 0.38936063936063936, "grad_norm": 1.7701550722122192, "learning_rate": 6.231204444670983e-05, "loss": 1.0233, "step": 7795 }, { "epoch": 0.38961038961038963, "grad_norm": 1.8438507318496704, "learning_rate": 6.228655894795862e-05, "loss": 0.9307, "step": 7800 }, { "epoch": 0.38986013986013984, "grad_norm": 1.9506025314331055, "learning_rate": 6.22610734492074e-05, "loss": 1.1365, "step": 7805 }, { "epoch": 0.3901098901098901, "grad_norm": 2.4201154708862305, "learning_rate": 6.22355879504562e-05, "loss": 0.986, "step": 7810 }, { "epoch": 0.3903596403596404, "grad_norm": 1.834039568901062, "learning_rate": 6.221010245170499e-05, "loss": 1.0088, "step": 7815 }, { "epoch": 0.3906093906093906, "grad_norm": 1.9000381231307983, "learning_rate": 6.218461695295377e-05, "loss": 1.146, "step": 7820 }, { "epoch": 0.39085914085914086, "grad_norm": 1.8220922946929932, "learning_rate": 6.215913145420256e-05, "loss": 0.9635, "step": 7825 }, { "epoch": 0.39110889110889113, "grad_norm": 2.287177085876465, "learning_rate": 6.213364595545134e-05, "loss": 1.0335, "step": 7830 }, { "epoch": 0.39135864135864135, "grad_norm": 2.1554508209228516, "learning_rate": 6.210816045670014e-05, "loss": 1.0826, "step": 7835 }, { "epoch": 0.3916083916083916, "grad_norm": 2.184490442276001, "learning_rate": 6.208267495794893e-05, "loss": 1.0526, "step": 7840 }, { "epoch": 0.3918581418581419, "grad_norm": 2.066262722015381, "learning_rate": 6.205718945919771e-05, "loss": 1.0298, "step": 7845 }, { "epoch": 0.3921078921078921, "grad_norm": 2.019073009490967, "learning_rate": 6.203170396044651e-05, "loss": 1.0007, "step": 7850 }, { "epoch": 0.39235764235764237, "grad_norm": 1.9522299766540527, "learning_rate": 6.20062184616953e-05, "loss": 1.0254, "step": 7855 }, { "epoch": 0.3926073926073926, "grad_norm": 2.0275156497955322, "learning_rate": 6.198073296294408e-05, "loss": 1.0382, "step": 7860 }, { "epoch": 0.39285714285714285, "grad_norm": 2.1964187622070312, "learning_rate": 6.195524746419288e-05, "loss": 1.1059, "step": 7865 }, { "epoch": 0.3931068931068931, "grad_norm": 1.92535400390625, "learning_rate": 6.192976196544167e-05, "loss": 1.0114, "step": 7870 }, { "epoch": 0.39335664335664333, "grad_norm": 1.7617707252502441, "learning_rate": 6.190427646669047e-05, "loss": 1.0038, "step": 7875 }, { "epoch": 0.3936063936063936, "grad_norm": 1.7267107963562012, "learning_rate": 6.187879096793925e-05, "loss": 1.0871, "step": 7880 }, { "epoch": 0.39385614385614387, "grad_norm": 2.6043457984924316, "learning_rate": 6.185330546918804e-05, "loss": 1.1022, "step": 7885 }, { "epoch": 0.3941058941058941, "grad_norm": 2.129574775695801, "learning_rate": 6.182781997043683e-05, "loss": 0.9986, "step": 7890 }, { "epoch": 0.39435564435564435, "grad_norm": 1.8685061931610107, "learning_rate": 6.180233447168562e-05, "loss": 0.9963, "step": 7895 }, { "epoch": 0.3946053946053946, "grad_norm": 1.9140383005142212, "learning_rate": 6.177684897293439e-05, "loss": 1.0248, "step": 7900 }, { "epoch": 0.39485514485514484, "grad_norm": 2.406261682510376, "learning_rate": 6.175136347418319e-05, "loss": 0.9303, "step": 7905 }, { "epoch": 0.3951048951048951, "grad_norm": 1.987163782119751, "learning_rate": 6.172587797543198e-05, "loss": 1.0658, "step": 7910 }, { "epoch": 0.3953546453546454, "grad_norm": 1.836574673652649, "learning_rate": 6.170039247668076e-05, "loss": 1.0143, "step": 7915 }, { "epoch": 0.3956043956043956, "grad_norm": 1.780957818031311, "learning_rate": 6.167490697792956e-05, "loss": 1.0193, "step": 7920 }, { "epoch": 0.39585414585414586, "grad_norm": 1.5493755340576172, "learning_rate": 6.164942147917835e-05, "loss": 0.986, "step": 7925 }, { "epoch": 0.3961038961038961, "grad_norm": 1.6459829807281494, "learning_rate": 6.162393598042715e-05, "loss": 0.9927, "step": 7930 }, { "epoch": 0.39635364635364634, "grad_norm": 2.0957529544830322, "learning_rate": 6.159845048167593e-05, "loss": 1.1094, "step": 7935 }, { "epoch": 0.3966033966033966, "grad_norm": 1.8176867961883545, "learning_rate": 6.157296498292472e-05, "loss": 1.0288, "step": 7940 }, { "epoch": 0.3968531468531469, "grad_norm": 1.9073083400726318, "learning_rate": 6.154747948417352e-05, "loss": 0.9895, "step": 7945 }, { "epoch": 0.3971028971028971, "grad_norm": 2.0622963905334473, "learning_rate": 6.15219939854223e-05, "loss": 1.0104, "step": 7950 }, { "epoch": 0.39735264735264736, "grad_norm": 1.7107315063476562, "learning_rate": 6.149650848667109e-05, "loss": 1.0337, "step": 7955 }, { "epoch": 0.39760239760239763, "grad_norm": 1.6694787740707397, "learning_rate": 6.147102298791988e-05, "loss": 1.0795, "step": 7960 }, { "epoch": 0.39785214785214784, "grad_norm": 2.0147619247436523, "learning_rate": 6.144553748916867e-05, "loss": 1.0733, "step": 7965 }, { "epoch": 0.3981018981018981, "grad_norm": 1.906727910041809, "learning_rate": 6.142005199041746e-05, "loss": 1.0243, "step": 7970 }, { "epoch": 0.3983516483516483, "grad_norm": 1.8113924264907837, "learning_rate": 6.139456649166625e-05, "loss": 1.0486, "step": 7975 }, { "epoch": 0.3986013986013986, "grad_norm": 1.93712317943573, "learning_rate": 6.136908099291503e-05, "loss": 1.0585, "step": 7980 }, { "epoch": 0.39885114885114886, "grad_norm": 1.811070203781128, "learning_rate": 6.134359549416383e-05, "loss": 0.9352, "step": 7985 }, { "epoch": 0.3991008991008991, "grad_norm": 2.146756172180176, "learning_rate": 6.131810999541261e-05, "loss": 1.0327, "step": 7990 }, { "epoch": 0.39935064935064934, "grad_norm": 2.057783603668213, "learning_rate": 6.12926244966614e-05, "loss": 1.038, "step": 7995 }, { "epoch": 0.3996003996003996, "grad_norm": 1.571982979774475, "learning_rate": 6.12671389979102e-05, "loss": 0.9791, "step": 8000 }, { "epoch": 0.3998501498501498, "grad_norm": 2.1200053691864014, "learning_rate": 6.124165349915898e-05, "loss": 1.0348, "step": 8005 }, { "epoch": 0.4000999000999001, "grad_norm": 2.313692569732666, "learning_rate": 6.121616800040777e-05, "loss": 1.0839, "step": 8010 }, { "epoch": 0.40034965034965037, "grad_norm": 1.814563274383545, "learning_rate": 6.119068250165656e-05, "loss": 0.978, "step": 8015 }, { "epoch": 0.4005994005994006, "grad_norm": 1.85591459274292, "learning_rate": 6.116519700290535e-05, "loss": 1.0489, "step": 8020 }, { "epoch": 0.40084915084915085, "grad_norm": 2.2633016109466553, "learning_rate": 6.113971150415414e-05, "loss": 1.0459, "step": 8025 }, { "epoch": 0.4010989010989011, "grad_norm": 2.3879554271698, "learning_rate": 6.111422600540293e-05, "loss": 0.972, "step": 8030 }, { "epoch": 0.40134865134865133, "grad_norm": 1.969138741493225, "learning_rate": 6.108874050665172e-05, "loss": 1.0266, "step": 8035 }, { "epoch": 0.4015984015984016, "grad_norm": 1.6464279890060425, "learning_rate": 6.10632550079005e-05, "loss": 1.0165, "step": 8040 }, { "epoch": 0.40184815184815187, "grad_norm": 2.31162428855896, "learning_rate": 6.10377695091493e-05, "loss": 1.0389, "step": 8045 }, { "epoch": 0.4020979020979021, "grad_norm": 2.0711169242858887, "learning_rate": 6.101228401039809e-05, "loss": 0.8539, "step": 8050 }, { "epoch": 0.40234765234765235, "grad_norm": 1.6622858047485352, "learning_rate": 6.098679851164688e-05, "loss": 0.9727, "step": 8055 }, { "epoch": 0.4025974025974026, "grad_norm": 1.7368098497390747, "learning_rate": 6.096131301289566e-05, "loss": 1.0491, "step": 8060 }, { "epoch": 0.40284715284715283, "grad_norm": 1.350246548652649, "learning_rate": 6.093582751414445e-05, "loss": 1.0561, "step": 8065 }, { "epoch": 0.4030969030969031, "grad_norm": 1.858502745628357, "learning_rate": 6.091034201539324e-05, "loss": 1.0042, "step": 8070 }, { "epoch": 0.40334665334665337, "grad_norm": 1.8915588855743408, "learning_rate": 6.088485651664203e-05, "loss": 0.9574, "step": 8075 }, { "epoch": 0.4035964035964036, "grad_norm": 2.140730142593384, "learning_rate": 6.085937101789082e-05, "loss": 1.1253, "step": 8080 }, { "epoch": 0.40384615384615385, "grad_norm": 1.524096131324768, "learning_rate": 6.083388551913961e-05, "loss": 1.0437, "step": 8085 }, { "epoch": 0.40409590409590407, "grad_norm": 1.76891028881073, "learning_rate": 6.08084000203884e-05, "loss": 1.1034, "step": 8090 }, { "epoch": 0.40434565434565434, "grad_norm": 2.307626247406006, "learning_rate": 6.078291452163719e-05, "loss": 1.0578, "step": 8095 }, { "epoch": 0.4045954045954046, "grad_norm": 2.21278977394104, "learning_rate": 6.075742902288598e-05, "loss": 1.0703, "step": 8100 }, { "epoch": 0.4048451548451548, "grad_norm": 2.4168224334716797, "learning_rate": 6.073194352413477e-05, "loss": 1.1163, "step": 8105 }, { "epoch": 0.4050949050949051, "grad_norm": 1.5887829065322876, "learning_rate": 6.070645802538356e-05, "loss": 1.1071, "step": 8110 }, { "epoch": 0.40534465534465536, "grad_norm": 1.9536082744598389, "learning_rate": 6.0680972526632354e-05, "loss": 1.1724, "step": 8115 }, { "epoch": 0.40559440559440557, "grad_norm": 1.9719754457473755, "learning_rate": 6.065548702788114e-05, "loss": 1.0193, "step": 8120 }, { "epoch": 0.40584415584415584, "grad_norm": 1.8917434215545654, "learning_rate": 6.063000152912993e-05, "loss": 1.0047, "step": 8125 }, { "epoch": 0.4060939060939061, "grad_norm": 2.2436978816986084, "learning_rate": 6.0604516030378724e-05, "loss": 1.0708, "step": 8130 }, { "epoch": 0.4063436563436563, "grad_norm": 2.3142213821411133, "learning_rate": 6.057903053162751e-05, "loss": 0.9623, "step": 8135 }, { "epoch": 0.4065934065934066, "grad_norm": 1.6898717880249023, "learning_rate": 6.055354503287629e-05, "loss": 1.0846, "step": 8140 }, { "epoch": 0.40684315684315686, "grad_norm": 2.49845027923584, "learning_rate": 6.052805953412508e-05, "loss": 0.9399, "step": 8145 }, { "epoch": 0.4070929070929071, "grad_norm": 2.0456221103668213, "learning_rate": 6.050257403537387e-05, "loss": 0.952, "step": 8150 }, { "epoch": 0.40734265734265734, "grad_norm": 1.8167829513549805, "learning_rate": 6.0477088536622665e-05, "loss": 1.0339, "step": 8155 }, { "epoch": 0.4075924075924076, "grad_norm": 2.16251802444458, "learning_rate": 6.045160303787145e-05, "loss": 1.0011, "step": 8160 }, { "epoch": 0.4078421578421578, "grad_norm": 2.0490524768829346, "learning_rate": 6.042611753912024e-05, "loss": 1.0251, "step": 8165 }, { "epoch": 0.4080919080919081, "grad_norm": 2.316455125808716, "learning_rate": 6.0400632040369035e-05, "loss": 0.975, "step": 8170 }, { "epoch": 0.40834165834165836, "grad_norm": 1.616011381149292, "learning_rate": 6.037514654161782e-05, "loss": 1.1016, "step": 8175 }, { "epoch": 0.4085914085914086, "grad_norm": 1.6761497259140015, "learning_rate": 6.034966104286661e-05, "loss": 1.0238, "step": 8180 }, { "epoch": 0.40884115884115885, "grad_norm": 2.251830577850342, "learning_rate": 6.0324175544115404e-05, "loss": 0.9885, "step": 8185 }, { "epoch": 0.4090909090909091, "grad_norm": 2.870927333831787, "learning_rate": 6.029869004536419e-05, "loss": 1.0774, "step": 8190 }, { "epoch": 0.40934065934065933, "grad_norm": 1.9341243505477905, "learning_rate": 6.027320454661298e-05, "loss": 1.0805, "step": 8195 }, { "epoch": 0.4095904095904096, "grad_norm": 2.142010450363159, "learning_rate": 6.0247719047861774e-05, "loss": 0.9714, "step": 8200 }, { "epoch": 0.40984015984015987, "grad_norm": 2.069467544555664, "learning_rate": 6.022223354911056e-05, "loss": 1.0579, "step": 8205 }, { "epoch": 0.4100899100899101, "grad_norm": 1.7878400087356567, "learning_rate": 6.019674805035935e-05, "loss": 0.9873, "step": 8210 }, { "epoch": 0.41033966033966035, "grad_norm": 2.2596030235290527, "learning_rate": 6.0171262551608144e-05, "loss": 1.0819, "step": 8215 }, { "epoch": 0.41058941058941056, "grad_norm": 1.9977140426635742, "learning_rate": 6.014577705285692e-05, "loss": 1.0171, "step": 8220 }, { "epoch": 0.41083916083916083, "grad_norm": 2.103902816772461, "learning_rate": 6.0120291554105715e-05, "loss": 1.0083, "step": 8225 }, { "epoch": 0.4110889110889111, "grad_norm": 1.6566181182861328, "learning_rate": 6.00948060553545e-05, "loss": 0.9988, "step": 8230 }, { "epoch": 0.4113386613386613, "grad_norm": 1.7796895503997803, "learning_rate": 6.006932055660329e-05, "loss": 0.9604, "step": 8235 }, { "epoch": 0.4115884115884116, "grad_norm": 1.7037739753723145, "learning_rate": 6.0043835057852084e-05, "loss": 0.9411, "step": 8240 }, { "epoch": 0.41183816183816185, "grad_norm": 1.7678488492965698, "learning_rate": 6.001834955910087e-05, "loss": 1.0381, "step": 8245 }, { "epoch": 0.41208791208791207, "grad_norm": 1.5839358568191528, "learning_rate": 5.999286406034966e-05, "loss": 1.1014, "step": 8250 }, { "epoch": 0.41233766233766234, "grad_norm": 1.6829092502593994, "learning_rate": 5.9967378561598454e-05, "loss": 1.0361, "step": 8255 }, { "epoch": 0.4125874125874126, "grad_norm": 1.7305080890655518, "learning_rate": 5.9941893062847246e-05, "loss": 1.0046, "step": 8260 }, { "epoch": 0.4128371628371628, "grad_norm": 1.903672456741333, "learning_rate": 5.991640756409603e-05, "loss": 0.8486, "step": 8265 }, { "epoch": 0.4130869130869131, "grad_norm": 1.992703914642334, "learning_rate": 5.9890922065344824e-05, "loss": 1.129, "step": 8270 }, { "epoch": 0.41333666333666336, "grad_norm": 1.626824140548706, "learning_rate": 5.9865436566593616e-05, "loss": 1.0557, "step": 8275 }, { "epoch": 0.41358641358641357, "grad_norm": 1.674551010131836, "learning_rate": 5.98399510678424e-05, "loss": 0.9978, "step": 8280 }, { "epoch": 0.41383616383616384, "grad_norm": 1.7303478717803955, "learning_rate": 5.9814465569091194e-05, "loss": 1.0677, "step": 8285 }, { "epoch": 0.4140859140859141, "grad_norm": 1.9046059846878052, "learning_rate": 5.9788980070339986e-05, "loss": 0.9986, "step": 8290 }, { "epoch": 0.4143356643356643, "grad_norm": 1.9615753889083862, "learning_rate": 5.976349457158877e-05, "loss": 1.0333, "step": 8295 }, { "epoch": 0.4145854145854146, "grad_norm": 2.0897624492645264, "learning_rate": 5.973800907283755e-05, "loss": 0.9591, "step": 8300 }, { "epoch": 0.41483516483516486, "grad_norm": 1.7357687950134277, "learning_rate": 5.971252357408634e-05, "loss": 1.0656, "step": 8305 }, { "epoch": 0.4150849150849151, "grad_norm": 2.0188961029052734, "learning_rate": 5.9687038075335134e-05, "loss": 1.0009, "step": 8310 }, { "epoch": 0.41533466533466534, "grad_norm": 2.085343360900879, "learning_rate": 5.9661552576583926e-05, "loss": 0.9269, "step": 8315 }, { "epoch": 0.4155844155844156, "grad_norm": 1.7639555931091309, "learning_rate": 5.963606707783271e-05, "loss": 0.9838, "step": 8320 }, { "epoch": 0.4158341658341658, "grad_norm": 1.5691882371902466, "learning_rate": 5.9610581579081504e-05, "loss": 0.8858, "step": 8325 }, { "epoch": 0.4160839160839161, "grad_norm": 2.127718448638916, "learning_rate": 5.9585096080330296e-05, "loss": 1.0423, "step": 8330 }, { "epoch": 0.4163336663336663, "grad_norm": 1.6688522100448608, "learning_rate": 5.955961058157908e-05, "loss": 0.9015, "step": 8335 }, { "epoch": 0.4165834165834166, "grad_norm": 1.6007344722747803, "learning_rate": 5.9534125082827874e-05, "loss": 1.2232, "step": 8340 }, { "epoch": 0.41683316683316685, "grad_norm": 2.08601713180542, "learning_rate": 5.9508639584076666e-05, "loss": 0.9318, "step": 8345 }, { "epoch": 0.41708291708291706, "grad_norm": 1.6389827728271484, "learning_rate": 5.948315408532545e-05, "loss": 1.0067, "step": 8350 }, { "epoch": 0.41733266733266733, "grad_norm": 1.8807002305984497, "learning_rate": 5.9457668586574244e-05, "loss": 1.0799, "step": 8355 }, { "epoch": 0.4175824175824176, "grad_norm": 1.9952985048294067, "learning_rate": 5.9432183087823036e-05, "loss": 1.1061, "step": 8360 }, { "epoch": 0.4178321678321678, "grad_norm": 1.4867914915084839, "learning_rate": 5.940669758907183e-05, "loss": 0.9479, "step": 8365 }, { "epoch": 0.4180819180819181, "grad_norm": 1.7281126976013184, "learning_rate": 5.938121209032061e-05, "loss": 1.0017, "step": 8370 }, { "epoch": 0.41833166833166835, "grad_norm": 3.4808974266052246, "learning_rate": 5.9355726591569406e-05, "loss": 1.0403, "step": 8375 }, { "epoch": 0.41858141858141856, "grad_norm": 1.7975858449935913, "learning_rate": 5.9330241092818184e-05, "loss": 1.0137, "step": 8380 }, { "epoch": 0.41883116883116883, "grad_norm": 2.714016914367676, "learning_rate": 5.9304755594066976e-05, "loss": 1.063, "step": 8385 }, { "epoch": 0.4190809190809191, "grad_norm": 1.6935707330703735, "learning_rate": 5.927927009531576e-05, "loss": 1.1056, "step": 8390 }, { "epoch": 0.4193306693306693, "grad_norm": 1.726171612739563, "learning_rate": 5.9253784596564554e-05, "loss": 1.0761, "step": 8395 }, { "epoch": 0.4195804195804196, "grad_norm": 2.2127768993377686, "learning_rate": 5.9228299097813346e-05, "loss": 1.005, "step": 8400 }, { "epoch": 0.41983016983016985, "grad_norm": 2.671513795852661, "learning_rate": 5.920281359906213e-05, "loss": 1.068, "step": 8405 }, { "epoch": 0.42007992007992007, "grad_norm": 1.772262692451477, "learning_rate": 5.9177328100310924e-05, "loss": 1.0406, "step": 8410 }, { "epoch": 0.42032967032967034, "grad_norm": 1.6859773397445679, "learning_rate": 5.9151842601559716e-05, "loss": 1.0005, "step": 8415 }, { "epoch": 0.4205794205794206, "grad_norm": 1.8427814245224, "learning_rate": 5.912635710280851e-05, "loss": 1.128, "step": 8420 }, { "epoch": 0.4208291708291708, "grad_norm": 1.892624020576477, "learning_rate": 5.9100871604057293e-05, "loss": 1.1418, "step": 8425 }, { "epoch": 0.4210789210789211, "grad_norm": 1.9597564935684204, "learning_rate": 5.9075386105306086e-05, "loss": 1.0776, "step": 8430 }, { "epoch": 0.42132867132867136, "grad_norm": 1.8179426193237305, "learning_rate": 5.904990060655488e-05, "loss": 1.0025, "step": 8435 }, { "epoch": 0.42157842157842157, "grad_norm": 1.5915203094482422, "learning_rate": 5.902441510780366e-05, "loss": 1.0666, "step": 8440 }, { "epoch": 0.42182817182817184, "grad_norm": 1.6697382926940918, "learning_rate": 5.8998929609052455e-05, "loss": 0.9917, "step": 8445 }, { "epoch": 0.42207792207792205, "grad_norm": 1.8285964727401733, "learning_rate": 5.897344411030125e-05, "loss": 0.9501, "step": 8450 }, { "epoch": 0.4223276723276723, "grad_norm": 2.222431182861328, "learning_rate": 5.894795861155003e-05, "loss": 0.9921, "step": 8455 }, { "epoch": 0.4225774225774226, "grad_norm": 1.7760493755340576, "learning_rate": 5.892247311279881e-05, "loss": 1.0204, "step": 8460 }, { "epoch": 0.4228271728271728, "grad_norm": 2.247835159301758, "learning_rate": 5.8896987614047604e-05, "loss": 1.0199, "step": 8465 }, { "epoch": 0.4230769230769231, "grad_norm": 1.8895570039749146, "learning_rate": 5.8871502115296396e-05, "loss": 0.9723, "step": 8470 }, { "epoch": 0.42332667332667334, "grad_norm": 1.8487448692321777, "learning_rate": 5.884601661654519e-05, "loss": 0.9822, "step": 8475 }, { "epoch": 0.42357642357642356, "grad_norm": 1.6751673221588135, "learning_rate": 5.8820531117793974e-05, "loss": 1.0159, "step": 8480 }, { "epoch": 0.4238261738261738, "grad_norm": 1.7134864330291748, "learning_rate": 5.8795045619042766e-05, "loss": 0.9097, "step": 8485 }, { "epoch": 0.4240759240759241, "grad_norm": 2.0210235118865967, "learning_rate": 5.876956012029156e-05, "loss": 0.942, "step": 8490 }, { "epoch": 0.4243256743256743, "grad_norm": 1.6382828950881958, "learning_rate": 5.874407462154034e-05, "loss": 1.0203, "step": 8495 }, { "epoch": 0.4245754245754246, "grad_norm": 1.8404662609100342, "learning_rate": 5.8718589122789136e-05, "loss": 0.971, "step": 8500 }, { "epoch": 0.42482517482517484, "grad_norm": 1.7030246257781982, "learning_rate": 5.869310362403793e-05, "loss": 1.0567, "step": 8505 }, { "epoch": 0.42507492507492506, "grad_norm": 2.131126642227173, "learning_rate": 5.866761812528671e-05, "loss": 1.0481, "step": 8510 }, { "epoch": 0.4253246753246753, "grad_norm": 1.468891978263855, "learning_rate": 5.8642132626535505e-05, "loss": 0.921, "step": 8515 }, { "epoch": 0.4255744255744256, "grad_norm": 1.4905483722686768, "learning_rate": 5.86166471277843e-05, "loss": 1.1199, "step": 8520 }, { "epoch": 0.4258241758241758, "grad_norm": 1.8215348720550537, "learning_rate": 5.859116162903309e-05, "loss": 0.9497, "step": 8525 }, { "epoch": 0.4260739260739261, "grad_norm": 2.20866060256958, "learning_rate": 5.8565676130281875e-05, "loss": 0.9973, "step": 8530 }, { "epoch": 0.42632367632367635, "grad_norm": 2.3619213104248047, "learning_rate": 5.854019063153067e-05, "loss": 1.0469, "step": 8535 }, { "epoch": 0.42657342657342656, "grad_norm": 1.7422620058059692, "learning_rate": 5.8514705132779446e-05, "loss": 1.0353, "step": 8540 }, { "epoch": 0.42682317682317683, "grad_norm": 1.9298968315124512, "learning_rate": 5.848921963402824e-05, "loss": 1.0891, "step": 8545 }, { "epoch": 0.4270729270729271, "grad_norm": 1.591360092163086, "learning_rate": 5.8463734135277023e-05, "loss": 1.0739, "step": 8550 }, { "epoch": 0.4273226773226773, "grad_norm": 2.287031888961792, "learning_rate": 5.8438248636525816e-05, "loss": 1.0104, "step": 8555 }, { "epoch": 0.4275724275724276, "grad_norm": 1.7210994958877563, "learning_rate": 5.841276313777461e-05, "loss": 1.0318, "step": 8560 }, { "epoch": 0.4278221778221778, "grad_norm": 1.493420958518982, "learning_rate": 5.83872776390234e-05, "loss": 1.1245, "step": 8565 }, { "epoch": 0.42807192807192807, "grad_norm": 2.3458189964294434, "learning_rate": 5.8361792140272185e-05, "loss": 1.1271, "step": 8570 }, { "epoch": 0.42832167832167833, "grad_norm": 2.1648874282836914, "learning_rate": 5.833630664152098e-05, "loss": 1.0832, "step": 8575 }, { "epoch": 0.42857142857142855, "grad_norm": 1.77560293674469, "learning_rate": 5.831082114276977e-05, "loss": 1.0286, "step": 8580 }, { "epoch": 0.4288211788211788, "grad_norm": 1.765061378479004, "learning_rate": 5.8285335644018555e-05, "loss": 1.0693, "step": 8585 }, { "epoch": 0.4290709290709291, "grad_norm": 3.1118929386138916, "learning_rate": 5.825985014526735e-05, "loss": 0.9921, "step": 8590 }, { "epoch": 0.4293206793206793, "grad_norm": 1.3973021507263184, "learning_rate": 5.823436464651614e-05, "loss": 0.9849, "step": 8595 }, { "epoch": 0.42957042957042957, "grad_norm": 1.8138960599899292, "learning_rate": 5.8208879147764925e-05, "loss": 1.0163, "step": 8600 }, { "epoch": 0.42982017982017984, "grad_norm": 2.1922130584716797, "learning_rate": 5.818339364901372e-05, "loss": 0.9601, "step": 8605 }, { "epoch": 0.43006993006993005, "grad_norm": 2.0836856365203857, "learning_rate": 5.815790815026251e-05, "loss": 0.9944, "step": 8610 }, { "epoch": 0.4303196803196803, "grad_norm": 2.551135778427124, "learning_rate": 5.8132422651511295e-05, "loss": 0.92, "step": 8615 }, { "epoch": 0.4305694305694306, "grad_norm": 2.286620616912842, "learning_rate": 5.810693715276008e-05, "loss": 0.9764, "step": 8620 }, { "epoch": 0.4308191808191808, "grad_norm": 2.0521771907806396, "learning_rate": 5.8081451654008866e-05, "loss": 0.9956, "step": 8625 }, { "epoch": 0.43106893106893107, "grad_norm": 1.9323019981384277, "learning_rate": 5.805596615525766e-05, "loss": 1.0467, "step": 8630 }, { "epoch": 0.43131868131868134, "grad_norm": 1.6725544929504395, "learning_rate": 5.803048065650645e-05, "loss": 1.0167, "step": 8635 }, { "epoch": 0.43156843156843155, "grad_norm": 1.922048568725586, "learning_rate": 5.8004995157755235e-05, "loss": 1.0773, "step": 8640 }, { "epoch": 0.4318181818181818, "grad_norm": 1.8454574346542358, "learning_rate": 5.797950965900403e-05, "loss": 1.0164, "step": 8645 }, { "epoch": 0.4320679320679321, "grad_norm": 1.6173814535140991, "learning_rate": 5.795402416025282e-05, "loss": 1.0424, "step": 8650 }, { "epoch": 0.4323176823176823, "grad_norm": 2.339289903640747, "learning_rate": 5.7928538661501605e-05, "loss": 1.1071, "step": 8655 }, { "epoch": 0.4325674325674326, "grad_norm": 1.9890997409820557, "learning_rate": 5.79030531627504e-05, "loss": 1.1168, "step": 8660 }, { "epoch": 0.43281718281718284, "grad_norm": 1.4483760595321655, "learning_rate": 5.787756766399919e-05, "loss": 1.1004, "step": 8665 }, { "epoch": 0.43306693306693306, "grad_norm": 1.6572809219360352, "learning_rate": 5.785208216524798e-05, "loss": 1.069, "step": 8670 }, { "epoch": 0.4333166833166833, "grad_norm": 2.0435497760772705, "learning_rate": 5.782659666649677e-05, "loss": 1.0359, "step": 8675 }, { "epoch": 0.43356643356643354, "grad_norm": 1.5905063152313232, "learning_rate": 5.780111116774556e-05, "loss": 1.0012, "step": 8680 }, { "epoch": 0.4338161838161838, "grad_norm": 1.8773237466812134, "learning_rate": 5.777562566899435e-05, "loss": 1.0605, "step": 8685 }, { "epoch": 0.4340659340659341, "grad_norm": 2.1612396240234375, "learning_rate": 5.775014017024314e-05, "loss": 1.0955, "step": 8690 }, { "epoch": 0.4343156843156843, "grad_norm": 1.469048261642456, "learning_rate": 5.772465467149193e-05, "loss": 1.0499, "step": 8695 }, { "epoch": 0.43456543456543456, "grad_norm": 2.2405614852905273, "learning_rate": 5.769916917274071e-05, "loss": 1.0375, "step": 8700 }, { "epoch": 0.43481518481518483, "grad_norm": 1.909608006477356, "learning_rate": 5.76736836739895e-05, "loss": 0.9127, "step": 8705 }, { "epoch": 0.43506493506493504, "grad_norm": 1.8946871757507324, "learning_rate": 5.7648198175238285e-05, "loss": 1.0266, "step": 8710 }, { "epoch": 0.4353146853146853, "grad_norm": 2.053183078765869, "learning_rate": 5.762271267648708e-05, "loss": 1.0387, "step": 8715 }, { "epoch": 0.4355644355644356, "grad_norm": 1.9851399660110474, "learning_rate": 5.759722717773587e-05, "loss": 0.967, "step": 8720 }, { "epoch": 0.4358141858141858, "grad_norm": 3.311208724975586, "learning_rate": 5.757174167898466e-05, "loss": 1.1796, "step": 8725 }, { "epoch": 0.43606393606393606, "grad_norm": 1.8107490539550781, "learning_rate": 5.754625618023345e-05, "loss": 1.1476, "step": 8730 }, { "epoch": 0.43631368631368633, "grad_norm": 2.1709601879119873, "learning_rate": 5.752077068148224e-05, "loss": 0.9429, "step": 8735 }, { "epoch": 0.43656343656343655, "grad_norm": 1.9626538753509521, "learning_rate": 5.749528518273103e-05, "loss": 1.008, "step": 8740 }, { "epoch": 0.4368131868131868, "grad_norm": 2.407304048538208, "learning_rate": 5.746979968397982e-05, "loss": 0.9624, "step": 8745 }, { "epoch": 0.4370629370629371, "grad_norm": 1.8312275409698486, "learning_rate": 5.744431418522861e-05, "loss": 1.0594, "step": 8750 }, { "epoch": 0.4373126873126873, "grad_norm": 1.9834412336349487, "learning_rate": 5.74188286864774e-05, "loss": 0.9931, "step": 8755 }, { "epoch": 0.43756243756243757, "grad_norm": 1.6721858978271484, "learning_rate": 5.739334318772619e-05, "loss": 0.993, "step": 8760 }, { "epoch": 0.43781218781218784, "grad_norm": 2.2042829990386963, "learning_rate": 5.736785768897498e-05, "loss": 1.0361, "step": 8765 }, { "epoch": 0.43806193806193805, "grad_norm": 2.0658700466156006, "learning_rate": 5.734237219022377e-05, "loss": 1.0625, "step": 8770 }, { "epoch": 0.4383116883116883, "grad_norm": 1.648266077041626, "learning_rate": 5.731688669147256e-05, "loss": 1.0664, "step": 8775 }, { "epoch": 0.4385614385614386, "grad_norm": 1.8135478496551514, "learning_rate": 5.729140119272134e-05, "loss": 0.9951, "step": 8780 }, { "epoch": 0.4388111888111888, "grad_norm": 2.208460807800293, "learning_rate": 5.726591569397013e-05, "loss": 0.9043, "step": 8785 }, { "epoch": 0.43906093906093907, "grad_norm": 2.123093366622925, "learning_rate": 5.724043019521892e-05, "loss": 1.0369, "step": 8790 }, { "epoch": 0.4393106893106893, "grad_norm": 2.6139187812805176, "learning_rate": 5.721494469646771e-05, "loss": 0.9169, "step": 8795 }, { "epoch": 0.43956043956043955, "grad_norm": 1.8385754823684692, "learning_rate": 5.71894591977165e-05, "loss": 0.9778, "step": 8800 }, { "epoch": 0.4398101898101898, "grad_norm": 2.103018283843994, "learning_rate": 5.716397369896529e-05, "loss": 1.204, "step": 8805 }, { "epoch": 0.44005994005994004, "grad_norm": 1.9360520839691162, "learning_rate": 5.713848820021408e-05, "loss": 1.1208, "step": 8810 }, { "epoch": 0.4403096903096903, "grad_norm": 1.6177865266799927, "learning_rate": 5.711300270146287e-05, "loss": 1.0134, "step": 8815 }, { "epoch": 0.4405594405594406, "grad_norm": 1.837673306465149, "learning_rate": 5.708751720271166e-05, "loss": 1.0796, "step": 8820 }, { "epoch": 0.4408091908091908, "grad_norm": 2.199380874633789, "learning_rate": 5.706203170396045e-05, "loss": 0.9652, "step": 8825 }, { "epoch": 0.44105894105894106, "grad_norm": 1.811627745628357, "learning_rate": 5.703654620520924e-05, "loss": 0.9977, "step": 8830 }, { "epoch": 0.4413086913086913, "grad_norm": 1.9669990539550781, "learning_rate": 5.701106070645803e-05, "loss": 1.0783, "step": 8835 }, { "epoch": 0.44155844155844154, "grad_norm": 1.954996943473816, "learning_rate": 5.698557520770682e-05, "loss": 1.0151, "step": 8840 }, { "epoch": 0.4418081918081918, "grad_norm": 2.263460159301758, "learning_rate": 5.696008970895561e-05, "loss": 0.962, "step": 8845 }, { "epoch": 0.4420579420579421, "grad_norm": 2.9194424152374268, "learning_rate": 5.69346042102044e-05, "loss": 1.1043, "step": 8850 }, { "epoch": 0.4423076923076923, "grad_norm": 2.2681126594543457, "learning_rate": 5.690911871145319e-05, "loss": 1.1515, "step": 8855 }, { "epoch": 0.44255744255744256, "grad_norm": 2.117429733276367, "learning_rate": 5.688363321270197e-05, "loss": 0.9509, "step": 8860 }, { "epoch": 0.44280719280719283, "grad_norm": 1.6960341930389404, "learning_rate": 5.685814771395076e-05, "loss": 1.1028, "step": 8865 }, { "epoch": 0.44305694305694304, "grad_norm": 1.9639930725097656, "learning_rate": 5.683266221519955e-05, "loss": 0.9601, "step": 8870 }, { "epoch": 0.4433066933066933, "grad_norm": 1.7155579328536987, "learning_rate": 5.680717671644834e-05, "loss": 0.9976, "step": 8875 }, { "epoch": 0.4435564435564436, "grad_norm": 2.0047292709350586, "learning_rate": 5.678169121769713e-05, "loss": 1.0631, "step": 8880 }, { "epoch": 0.4438061938061938, "grad_norm": 1.799039363861084, "learning_rate": 5.6756205718945923e-05, "loss": 0.9843, "step": 8885 }, { "epoch": 0.44405594405594406, "grad_norm": 1.353966236114502, "learning_rate": 5.673072022019471e-05, "loss": 0.9337, "step": 8890 }, { "epoch": 0.44430569430569433, "grad_norm": 2.321974515914917, "learning_rate": 5.67052347214435e-05, "loss": 0.9146, "step": 8895 }, { "epoch": 0.44455544455544455, "grad_norm": 1.9653019905090332, "learning_rate": 5.667974922269229e-05, "loss": 1.0586, "step": 8900 }, { "epoch": 0.4448051948051948, "grad_norm": 1.4947035312652588, "learning_rate": 5.665426372394108e-05, "loss": 1.021, "step": 8905 }, { "epoch": 0.44505494505494503, "grad_norm": 1.8535796403884888, "learning_rate": 5.662877822518987e-05, "loss": 0.9861, "step": 8910 }, { "epoch": 0.4453046953046953, "grad_norm": 2.0894110202789307, "learning_rate": 5.660329272643866e-05, "loss": 0.9727, "step": 8915 }, { "epoch": 0.44555444555444557, "grad_norm": 2.1427135467529297, "learning_rate": 5.657780722768745e-05, "loss": 0.9483, "step": 8920 }, { "epoch": 0.4458041958041958, "grad_norm": 2.355966091156006, "learning_rate": 5.655232172893624e-05, "loss": 1.0602, "step": 8925 }, { "epoch": 0.44605394605394605, "grad_norm": 1.7807228565216064, "learning_rate": 5.652683623018503e-05, "loss": 1.0324, "step": 8930 }, { "epoch": 0.4463036963036963, "grad_norm": 2.0543699264526367, "learning_rate": 5.6501350731433825e-05, "loss": 1.0361, "step": 8935 }, { "epoch": 0.44655344655344653, "grad_norm": 1.6615523099899292, "learning_rate": 5.6475865232682604e-05, "loss": 0.9834, "step": 8940 }, { "epoch": 0.4468031968031968, "grad_norm": 1.5787633657455444, "learning_rate": 5.645037973393139e-05, "loss": 0.9857, "step": 8945 }, { "epoch": 0.44705294705294707, "grad_norm": 1.7049851417541504, "learning_rate": 5.642489423518018e-05, "loss": 1.0416, "step": 8950 }, { "epoch": 0.4473026973026973, "grad_norm": 2.1409668922424316, "learning_rate": 5.639940873642897e-05, "loss": 1.0292, "step": 8955 }, { "epoch": 0.44755244755244755, "grad_norm": 2.2022745609283447, "learning_rate": 5.637392323767776e-05, "loss": 0.9766, "step": 8960 }, { "epoch": 0.4478021978021978, "grad_norm": 1.7518359422683716, "learning_rate": 5.634843773892655e-05, "loss": 0.9826, "step": 8965 }, { "epoch": 0.44805194805194803, "grad_norm": 1.5120329856872559, "learning_rate": 5.632295224017534e-05, "loss": 1.1363, "step": 8970 }, { "epoch": 0.4483016983016983, "grad_norm": 1.7536566257476807, "learning_rate": 5.629746674142413e-05, "loss": 1.0544, "step": 8975 }, { "epoch": 0.4485514485514486, "grad_norm": 1.9257601499557495, "learning_rate": 5.627198124267292e-05, "loss": 1.067, "step": 8980 }, { "epoch": 0.4488011988011988, "grad_norm": 1.8563898801803589, "learning_rate": 5.624649574392171e-05, "loss": 1.1123, "step": 8985 }, { "epoch": 0.44905094905094906, "grad_norm": 2.2734243869781494, "learning_rate": 5.6221010245170505e-05, "loss": 1.0583, "step": 8990 }, { "epoch": 0.4493006993006993, "grad_norm": 2.1038169860839844, "learning_rate": 5.619552474641929e-05, "loss": 1.0462, "step": 8995 }, { "epoch": 0.44955044955044954, "grad_norm": 1.7338979244232178, "learning_rate": 5.617003924766808e-05, "loss": 1.0255, "step": 9000 }, { "epoch": 0.4498001998001998, "grad_norm": 1.8403189182281494, "learning_rate": 5.6144553748916875e-05, "loss": 1.0535, "step": 9005 }, { "epoch": 0.4500499500499501, "grad_norm": 2.0320918560028076, "learning_rate": 5.611906825016566e-05, "loss": 1.0853, "step": 9010 }, { "epoch": 0.4502997002997003, "grad_norm": 2.086784839630127, "learning_rate": 5.609358275141445e-05, "loss": 0.9748, "step": 9015 }, { "epoch": 0.45054945054945056, "grad_norm": 1.9181675910949707, "learning_rate": 5.606809725266323e-05, "loss": 0.9972, "step": 9020 }, { "epoch": 0.45079920079920077, "grad_norm": 2.073082208633423, "learning_rate": 5.604261175391202e-05, "loss": 1.0968, "step": 9025 }, { "epoch": 0.45104895104895104, "grad_norm": 1.70656156539917, "learning_rate": 5.6017126255160815e-05, "loss": 1.0431, "step": 9030 }, { "epoch": 0.4512987012987013, "grad_norm": 3.370615243911743, "learning_rate": 5.59916407564096e-05, "loss": 1.0099, "step": 9035 }, { "epoch": 0.4515484515484515, "grad_norm": 2.32547664642334, "learning_rate": 5.596615525765839e-05, "loss": 1.0379, "step": 9040 }, { "epoch": 0.4517982017982018, "grad_norm": 2.4206790924072266, "learning_rate": 5.5940669758907185e-05, "loss": 1.1314, "step": 9045 }, { "epoch": 0.45204795204795206, "grad_norm": 1.8206801414489746, "learning_rate": 5.591518426015597e-05, "loss": 0.9944, "step": 9050 }, { "epoch": 0.4522977022977023, "grad_norm": 2.1931114196777344, "learning_rate": 5.588969876140476e-05, "loss": 1.0763, "step": 9055 }, { "epoch": 0.45254745254745254, "grad_norm": 2.0819082260131836, "learning_rate": 5.5864213262653555e-05, "loss": 1.0379, "step": 9060 }, { "epoch": 0.4527972027972028, "grad_norm": 1.6131161451339722, "learning_rate": 5.583872776390234e-05, "loss": 0.933, "step": 9065 }, { "epoch": 0.453046953046953, "grad_norm": 1.8526209592819214, "learning_rate": 5.581324226515113e-05, "loss": 1.0519, "step": 9070 }, { "epoch": 0.4532967032967033, "grad_norm": 1.983704686164856, "learning_rate": 5.5787756766399925e-05, "loss": 1.122, "step": 9075 }, { "epoch": 0.45354645354645357, "grad_norm": 2.206155776977539, "learning_rate": 5.576227126764872e-05, "loss": 0.9484, "step": 9080 }, { "epoch": 0.4537962037962038, "grad_norm": 2.108036756515503, "learning_rate": 5.57367857688975e-05, "loss": 0.966, "step": 9085 }, { "epoch": 0.45404595404595405, "grad_norm": 1.7380211353302002, "learning_rate": 5.5711300270146294e-05, "loss": 1.0359, "step": 9090 }, { "epoch": 0.4542957042957043, "grad_norm": 2.774899959564209, "learning_rate": 5.568581477139509e-05, "loss": 1.0866, "step": 9095 }, { "epoch": 0.45454545454545453, "grad_norm": 1.9622609615325928, "learning_rate": 5.5660329272643865e-05, "loss": 1.0635, "step": 9100 }, { "epoch": 0.4547952047952048, "grad_norm": 1.928787350654602, "learning_rate": 5.563484377389265e-05, "loss": 1.1644, "step": 9105 }, { "epoch": 0.45504495504495507, "grad_norm": 1.9837929010391235, "learning_rate": 5.560935827514144e-05, "loss": 1.0163, "step": 9110 }, { "epoch": 0.4552947052947053, "grad_norm": 2.009246349334717, "learning_rate": 5.5583872776390235e-05, "loss": 1.0191, "step": 9115 }, { "epoch": 0.45554445554445555, "grad_norm": 2.009209632873535, "learning_rate": 5.555838727763902e-05, "loss": 1.0321, "step": 9120 }, { "epoch": 0.4557942057942058, "grad_norm": 2.0588490962982178, "learning_rate": 5.553290177888781e-05, "loss": 1.0827, "step": 9125 }, { "epoch": 0.45604395604395603, "grad_norm": 1.6079925298690796, "learning_rate": 5.5507416280136605e-05, "loss": 0.9829, "step": 9130 }, { "epoch": 0.4562937062937063, "grad_norm": 2.07023549079895, "learning_rate": 5.54819307813854e-05, "loss": 1.1175, "step": 9135 }, { "epoch": 0.4565434565434565, "grad_norm": 1.790460467338562, "learning_rate": 5.545644528263418e-05, "loss": 1.0102, "step": 9140 }, { "epoch": 0.4567932067932068, "grad_norm": 2.209059715270996, "learning_rate": 5.5430959783882975e-05, "loss": 1.084, "step": 9145 }, { "epoch": 0.45704295704295705, "grad_norm": 1.4967703819274902, "learning_rate": 5.540547428513177e-05, "loss": 1.0831, "step": 9150 }, { "epoch": 0.45729270729270727, "grad_norm": 1.515915036201477, "learning_rate": 5.537998878638055e-05, "loss": 1.0491, "step": 9155 }, { "epoch": 0.45754245754245754, "grad_norm": 1.9706919193267822, "learning_rate": 5.5354503287629344e-05, "loss": 1.0357, "step": 9160 }, { "epoch": 0.4577922077922078, "grad_norm": 2.3127715587615967, "learning_rate": 5.5329017788878137e-05, "loss": 1.0145, "step": 9165 }, { "epoch": 0.458041958041958, "grad_norm": 2.0604145526885986, "learning_rate": 5.530353229012692e-05, "loss": 0.9965, "step": 9170 }, { "epoch": 0.4582917082917083, "grad_norm": 1.7411119937896729, "learning_rate": 5.5278046791375714e-05, "loss": 0.9808, "step": 9175 }, { "epoch": 0.45854145854145856, "grad_norm": 2.569514513015747, "learning_rate": 5.525256129262449e-05, "loss": 1.0047, "step": 9180 }, { "epoch": 0.45879120879120877, "grad_norm": 1.7290107011795044, "learning_rate": 5.5227075793873285e-05, "loss": 1.02, "step": 9185 }, { "epoch": 0.45904095904095904, "grad_norm": 1.7303178310394287, "learning_rate": 5.520159029512208e-05, "loss": 1.0151, "step": 9190 }, { "epoch": 0.4592907092907093, "grad_norm": 1.974257230758667, "learning_rate": 5.517610479637086e-05, "loss": 0.9595, "step": 9195 }, { "epoch": 0.4595404595404595, "grad_norm": 2.59578800201416, "learning_rate": 5.5150619297619655e-05, "loss": 1.0493, "step": 9200 }, { "epoch": 0.4597902097902098, "grad_norm": 1.7303261756896973, "learning_rate": 5.512513379886845e-05, "loss": 0.9425, "step": 9205 }, { "epoch": 0.46003996003996006, "grad_norm": 1.5229170322418213, "learning_rate": 5.509964830011723e-05, "loss": 0.9954, "step": 9210 }, { "epoch": 0.4602897102897103, "grad_norm": 1.685680627822876, "learning_rate": 5.5074162801366025e-05, "loss": 0.9325, "step": 9215 }, { "epoch": 0.46053946053946054, "grad_norm": 1.845762848854065, "learning_rate": 5.504867730261482e-05, "loss": 1.1016, "step": 9220 }, { "epoch": 0.4607892107892108, "grad_norm": 2.2828259468078613, "learning_rate": 5.50231918038636e-05, "loss": 0.9881, "step": 9225 }, { "epoch": 0.461038961038961, "grad_norm": 2.262589454650879, "learning_rate": 5.4997706305112394e-05, "loss": 1.0241, "step": 9230 }, { "epoch": 0.4612887112887113, "grad_norm": 2.320115089416504, "learning_rate": 5.4972220806361186e-05, "loss": 0.9562, "step": 9235 }, { "epoch": 0.46153846153846156, "grad_norm": 1.7466362714767456, "learning_rate": 5.494673530760998e-05, "loss": 0.9901, "step": 9240 }, { "epoch": 0.4617882117882118, "grad_norm": 1.8919693231582642, "learning_rate": 5.4921249808858764e-05, "loss": 0.9799, "step": 9245 }, { "epoch": 0.46203796203796205, "grad_norm": 1.9142965078353882, "learning_rate": 5.4895764310107556e-05, "loss": 1.0359, "step": 9250 }, { "epoch": 0.46228771228771226, "grad_norm": 1.7192375659942627, "learning_rate": 5.487027881135635e-05, "loss": 0.9716, "step": 9255 }, { "epoch": 0.46253746253746253, "grad_norm": 1.8106921911239624, "learning_rate": 5.484479331260513e-05, "loss": 1.1944, "step": 9260 }, { "epoch": 0.4627872127872128, "grad_norm": 2.0583062171936035, "learning_rate": 5.481930781385391e-05, "loss": 1.0375, "step": 9265 }, { "epoch": 0.463036963036963, "grad_norm": 1.9468348026275635, "learning_rate": 5.4793822315102705e-05, "loss": 0.9898, "step": 9270 }, { "epoch": 0.4632867132867133, "grad_norm": 1.8039051294326782, "learning_rate": 5.47683368163515e-05, "loss": 1.1212, "step": 9275 }, { "epoch": 0.46353646353646355, "grad_norm": 2.0479846000671387, "learning_rate": 5.474285131760028e-05, "loss": 1.0976, "step": 9280 }, { "epoch": 0.46378621378621376, "grad_norm": 2.444497585296631, "learning_rate": 5.4717365818849074e-05, "loss": 1.0243, "step": 9285 }, { "epoch": 0.46403596403596403, "grad_norm": 1.6374053955078125, "learning_rate": 5.4691880320097867e-05, "loss": 0.9585, "step": 9290 }, { "epoch": 0.4642857142857143, "grad_norm": 1.8804490566253662, "learning_rate": 5.466639482134666e-05, "loss": 0.9699, "step": 9295 }, { "epoch": 0.4645354645354645, "grad_norm": 2.045616865158081, "learning_rate": 5.4640909322595444e-05, "loss": 1.0318, "step": 9300 }, { "epoch": 0.4647852147852148, "grad_norm": 2.386042833328247, "learning_rate": 5.4615423823844236e-05, "loss": 1.0406, "step": 9305 }, { "epoch": 0.46503496503496505, "grad_norm": 1.7758439779281616, "learning_rate": 5.458993832509303e-05, "loss": 1.0931, "step": 9310 }, { "epoch": 0.46528471528471527, "grad_norm": 1.6942564249038696, "learning_rate": 5.4564452826341814e-05, "loss": 1.0864, "step": 9315 }, { "epoch": 0.46553446553446554, "grad_norm": 1.397141456604004, "learning_rate": 5.4538967327590606e-05, "loss": 0.9717, "step": 9320 }, { "epoch": 0.4657842157842158, "grad_norm": 2.0510904788970947, "learning_rate": 5.45134818288394e-05, "loss": 1.0703, "step": 9325 }, { "epoch": 0.466033966033966, "grad_norm": 2.0366575717926025, "learning_rate": 5.4487996330088184e-05, "loss": 0.9848, "step": 9330 }, { "epoch": 0.4662837162837163, "grad_norm": 2.0513663291931152, "learning_rate": 5.4462510831336976e-05, "loss": 1.0642, "step": 9335 }, { "epoch": 0.46653346653346656, "grad_norm": 1.6523046493530273, "learning_rate": 5.4437025332585755e-05, "loss": 1.0382, "step": 9340 }, { "epoch": 0.46678321678321677, "grad_norm": 2.157517671585083, "learning_rate": 5.441153983383455e-05, "loss": 1.0676, "step": 9345 }, { "epoch": 0.46703296703296704, "grad_norm": 1.7325857877731323, "learning_rate": 5.438605433508334e-05, "loss": 1.1225, "step": 9350 }, { "epoch": 0.4672827172827173, "grad_norm": 2.0683634281158447, "learning_rate": 5.4360568836332124e-05, "loss": 1.0735, "step": 9355 }, { "epoch": 0.4675324675324675, "grad_norm": 1.9098098278045654, "learning_rate": 5.4335083337580916e-05, "loss": 1.0378, "step": 9360 }, { "epoch": 0.4677822177822178, "grad_norm": 1.5465494394302368, "learning_rate": 5.430959783882971e-05, "loss": 1.065, "step": 9365 }, { "epoch": 0.468031968031968, "grad_norm": 1.7016611099243164, "learning_rate": 5.4284112340078494e-05, "loss": 1.0453, "step": 9370 }, { "epoch": 0.4682817182817183, "grad_norm": 1.822651743888855, "learning_rate": 5.4258626841327286e-05, "loss": 0.945, "step": 9375 }, { "epoch": 0.46853146853146854, "grad_norm": 1.9091936349868774, "learning_rate": 5.423314134257608e-05, "loss": 0.9735, "step": 9380 }, { "epoch": 0.46878121878121876, "grad_norm": 2.064131736755371, "learning_rate": 5.4207655843824864e-05, "loss": 1.1112, "step": 9385 }, { "epoch": 0.469030969030969, "grad_norm": 1.8060588836669922, "learning_rate": 5.4182170345073656e-05, "loss": 1.0431, "step": 9390 }, { "epoch": 0.4692807192807193, "grad_norm": 2.4606287479400635, "learning_rate": 5.415668484632245e-05, "loss": 0.9969, "step": 9395 }, { "epoch": 0.4695304695304695, "grad_norm": 1.944982886314392, "learning_rate": 5.413119934757124e-05, "loss": 1.1479, "step": 9400 }, { "epoch": 0.4697802197802198, "grad_norm": 2.4000492095947266, "learning_rate": 5.4105713848820026e-05, "loss": 1.1077, "step": 9405 }, { "epoch": 0.47002997002997005, "grad_norm": 1.8426103591918945, "learning_rate": 5.408022835006882e-05, "loss": 1.0209, "step": 9410 }, { "epoch": 0.47027972027972026, "grad_norm": 2.228710651397705, "learning_rate": 5.405474285131761e-05, "loss": 0.9696, "step": 9415 }, { "epoch": 0.47052947052947053, "grad_norm": 1.7160286903381348, "learning_rate": 5.402925735256639e-05, "loss": 1.0035, "step": 9420 }, { "epoch": 0.4707792207792208, "grad_norm": 2.2011218070983887, "learning_rate": 5.4003771853815174e-05, "loss": 1.0523, "step": 9425 }, { "epoch": 0.471028971028971, "grad_norm": 2.0960075855255127, "learning_rate": 5.3978286355063966e-05, "loss": 0.9146, "step": 9430 }, { "epoch": 0.4712787212787213, "grad_norm": 1.6829379796981812, "learning_rate": 5.395280085631276e-05, "loss": 0.9631, "step": 9435 }, { "epoch": 0.47152847152847155, "grad_norm": 1.899162769317627, "learning_rate": 5.392731535756155e-05, "loss": 1.0664, "step": 9440 }, { "epoch": 0.47177822177822176, "grad_norm": 1.7100518941879272, "learning_rate": 5.3901829858810336e-05, "loss": 1.0561, "step": 9445 }, { "epoch": 0.47202797202797203, "grad_norm": 1.640717625617981, "learning_rate": 5.387634436005913e-05, "loss": 0.9822, "step": 9450 }, { "epoch": 0.4722777222777223, "grad_norm": 2.6145331859588623, "learning_rate": 5.385085886130792e-05, "loss": 0.9774, "step": 9455 }, { "epoch": 0.4725274725274725, "grad_norm": 1.695280909538269, "learning_rate": 5.3825373362556706e-05, "loss": 0.9988, "step": 9460 }, { "epoch": 0.4727772227772228, "grad_norm": 1.7193050384521484, "learning_rate": 5.37998878638055e-05, "loss": 0.9047, "step": 9465 }, { "epoch": 0.47302697302697305, "grad_norm": 1.5755399465560913, "learning_rate": 5.377440236505429e-05, "loss": 1.0589, "step": 9470 }, { "epoch": 0.47327672327672327, "grad_norm": 2.072739362716675, "learning_rate": 5.3748916866303076e-05, "loss": 0.9375, "step": 9475 }, { "epoch": 0.47352647352647353, "grad_norm": 1.7903701066970825, "learning_rate": 5.372343136755187e-05, "loss": 1.0963, "step": 9480 }, { "epoch": 0.4737762237762238, "grad_norm": 2.0294811725616455, "learning_rate": 5.369794586880066e-05, "loss": 1.0108, "step": 9485 }, { "epoch": 0.474025974025974, "grad_norm": 2.0755722522735596, "learning_rate": 5.367246037004945e-05, "loss": 1.0286, "step": 9490 }, { "epoch": 0.4742757242757243, "grad_norm": 2.401132822036743, "learning_rate": 5.364697487129824e-05, "loss": 0.9075, "step": 9495 }, { "epoch": 0.4745254745254745, "grad_norm": 2.4186060428619385, "learning_rate": 5.3621489372547016e-05, "loss": 1.0229, "step": 9500 }, { "epoch": 0.47477522477522477, "grad_norm": 1.8958390951156616, "learning_rate": 5.359600387379581e-05, "loss": 1.1007, "step": 9505 }, { "epoch": 0.47502497502497504, "grad_norm": 2.554835796356201, "learning_rate": 5.35705183750446e-05, "loss": 1.0623, "step": 9510 }, { "epoch": 0.47527472527472525, "grad_norm": 1.636560082435608, "learning_rate": 5.3545032876293386e-05, "loss": 0.9719, "step": 9515 }, { "epoch": 0.4755244755244755, "grad_norm": 2.290989875793457, "learning_rate": 5.351954737754218e-05, "loss": 1.0223, "step": 9520 }, { "epoch": 0.4757742257742258, "grad_norm": 2.2178313732147217, "learning_rate": 5.349406187879097e-05, "loss": 0.972, "step": 9525 }, { "epoch": 0.476023976023976, "grad_norm": 1.7402650117874146, "learning_rate": 5.3468576380039756e-05, "loss": 1.1011, "step": 9530 }, { "epoch": 0.47627372627372627, "grad_norm": 1.8568016290664673, "learning_rate": 5.344309088128855e-05, "loss": 0.9785, "step": 9535 }, { "epoch": 0.47652347652347654, "grad_norm": 1.5964857339859009, "learning_rate": 5.341760538253734e-05, "loss": 1.1564, "step": 9540 }, { "epoch": 0.47677322677322675, "grad_norm": 2.139514684677124, "learning_rate": 5.339211988378613e-05, "loss": 1.0288, "step": 9545 }, { "epoch": 0.477022977022977, "grad_norm": 2.0169742107391357, "learning_rate": 5.336663438503492e-05, "loss": 0.9761, "step": 9550 }, { "epoch": 0.4772727272727273, "grad_norm": 2.1550629138946533, "learning_rate": 5.334114888628371e-05, "loss": 0.9958, "step": 9555 }, { "epoch": 0.4775224775224775, "grad_norm": 2.8084795475006104, "learning_rate": 5.33156633875325e-05, "loss": 1.1117, "step": 9560 }, { "epoch": 0.4777722277722278, "grad_norm": 1.68765127658844, "learning_rate": 5.329017788878129e-05, "loss": 1.1308, "step": 9565 }, { "epoch": 0.47802197802197804, "grad_norm": 1.9251481294631958, "learning_rate": 5.326469239003008e-05, "loss": 1.0184, "step": 9570 }, { "epoch": 0.47827172827172826, "grad_norm": 1.6023755073547363, "learning_rate": 5.323920689127887e-05, "loss": 1.0223, "step": 9575 }, { "epoch": 0.4785214785214785, "grad_norm": 1.876654028892517, "learning_rate": 5.321372139252765e-05, "loss": 1.0599, "step": 9580 }, { "epoch": 0.4787712287712288, "grad_norm": 2.49786376953125, "learning_rate": 5.3188235893776436e-05, "loss": 1.0659, "step": 9585 }, { "epoch": 0.479020979020979, "grad_norm": 1.637902855873108, "learning_rate": 5.316275039502523e-05, "loss": 1.0834, "step": 9590 }, { "epoch": 0.4792707292707293, "grad_norm": 2.210360527038574, "learning_rate": 5.313726489627402e-05, "loss": 1.0125, "step": 9595 }, { "epoch": 0.47952047952047955, "grad_norm": 1.5040931701660156, "learning_rate": 5.311177939752281e-05, "loss": 0.9377, "step": 9600 }, { "epoch": 0.47977022977022976, "grad_norm": 1.9619786739349365, "learning_rate": 5.30862938987716e-05, "loss": 0.9852, "step": 9605 }, { "epoch": 0.48001998001998003, "grad_norm": 1.980612874031067, "learning_rate": 5.306080840002039e-05, "loss": 0.9613, "step": 9610 }, { "epoch": 0.48026973026973024, "grad_norm": 1.9338730573654175, "learning_rate": 5.303532290126918e-05, "loss": 1.0631, "step": 9615 }, { "epoch": 0.4805194805194805, "grad_norm": 1.8526195287704468, "learning_rate": 5.300983740251797e-05, "loss": 0.982, "step": 9620 }, { "epoch": 0.4807692307692308, "grad_norm": 1.8020743131637573, "learning_rate": 5.298435190376676e-05, "loss": 1.0699, "step": 9625 }, { "epoch": 0.481018981018981, "grad_norm": 2.1518824100494385, "learning_rate": 5.295886640501555e-05, "loss": 0.9867, "step": 9630 }, { "epoch": 0.48126873126873126, "grad_norm": 2.0896096229553223, "learning_rate": 5.293338090626434e-05, "loss": 0.9724, "step": 9635 }, { "epoch": 0.48151848151848153, "grad_norm": 1.5658822059631348, "learning_rate": 5.290789540751313e-05, "loss": 0.9926, "step": 9640 }, { "epoch": 0.48176823176823175, "grad_norm": 1.671790599822998, "learning_rate": 5.288240990876192e-05, "loss": 0.9353, "step": 9645 }, { "epoch": 0.482017982017982, "grad_norm": 1.7988522052764893, "learning_rate": 5.2856924410010714e-05, "loss": 1.0307, "step": 9650 }, { "epoch": 0.4822677322677323, "grad_norm": 1.5933271646499634, "learning_rate": 5.28314389112595e-05, "loss": 0.9884, "step": 9655 }, { "epoch": 0.4825174825174825, "grad_norm": 1.8909810781478882, "learning_rate": 5.280595341250828e-05, "loss": 1.0094, "step": 9660 }, { "epoch": 0.48276723276723277, "grad_norm": 1.9581259489059448, "learning_rate": 5.278046791375707e-05, "loss": 1.0, "step": 9665 }, { "epoch": 0.48301698301698304, "grad_norm": 1.6882386207580566, "learning_rate": 5.275498241500586e-05, "loss": 0.9861, "step": 9670 }, { "epoch": 0.48326673326673325, "grad_norm": 1.948276162147522, "learning_rate": 5.272949691625465e-05, "loss": 1.0852, "step": 9675 }, { "epoch": 0.4835164835164835, "grad_norm": 1.9937770366668701, "learning_rate": 5.270401141750344e-05, "loss": 1.009, "step": 9680 }, { "epoch": 0.4837662337662338, "grad_norm": 1.7799137830734253, "learning_rate": 5.267852591875223e-05, "loss": 1.0792, "step": 9685 }, { "epoch": 0.484015984015984, "grad_norm": 1.5962083339691162, "learning_rate": 5.265304042000102e-05, "loss": 1.0208, "step": 9690 }, { "epoch": 0.48426573426573427, "grad_norm": 2.4298648834228516, "learning_rate": 5.262755492124981e-05, "loss": 0.9077, "step": 9695 }, { "epoch": 0.48451548451548454, "grad_norm": 2.082685708999634, "learning_rate": 5.26020694224986e-05, "loss": 1.1083, "step": 9700 }, { "epoch": 0.48476523476523475, "grad_norm": 2.105766534805298, "learning_rate": 5.2576583923747394e-05, "loss": 1.0371, "step": 9705 }, { "epoch": 0.485014985014985, "grad_norm": 1.6959750652313232, "learning_rate": 5.255109842499618e-05, "loss": 1.046, "step": 9710 }, { "epoch": 0.4852647352647353, "grad_norm": 1.850319504737854, "learning_rate": 5.252561292624497e-05, "loss": 1.112, "step": 9715 }, { "epoch": 0.4855144855144855, "grad_norm": 2.144993305206299, "learning_rate": 5.2500127427493764e-05, "loss": 1.0043, "step": 9720 }, { "epoch": 0.4857642357642358, "grad_norm": 1.7373464107513428, "learning_rate": 5.247464192874255e-05, "loss": 1.0088, "step": 9725 }, { "epoch": 0.486013986013986, "grad_norm": 2.5161895751953125, "learning_rate": 5.244915642999134e-05, "loss": 1.0188, "step": 9730 }, { "epoch": 0.48626373626373626, "grad_norm": 2.00431489944458, "learning_rate": 5.2423670931240134e-05, "loss": 1.061, "step": 9735 }, { "epoch": 0.4865134865134865, "grad_norm": 2.0047731399536133, "learning_rate": 5.239818543248891e-05, "loss": 1.031, "step": 9740 }, { "epoch": 0.48676323676323674, "grad_norm": 2.0220327377319336, "learning_rate": 5.2372699933737704e-05, "loss": 0.9459, "step": 9745 }, { "epoch": 0.487012987012987, "grad_norm": 2.7864863872528076, "learning_rate": 5.234721443498649e-05, "loss": 1.0028, "step": 9750 }, { "epoch": 0.4872627372627373, "grad_norm": 1.7160661220550537, "learning_rate": 5.232172893623528e-05, "loss": 1.1512, "step": 9755 }, { "epoch": 0.4875124875124875, "grad_norm": 1.8996385335922241, "learning_rate": 5.2296243437484074e-05, "loss": 1.0145, "step": 9760 }, { "epoch": 0.48776223776223776, "grad_norm": 2.0334906578063965, "learning_rate": 5.227075793873286e-05, "loss": 1.0757, "step": 9765 }, { "epoch": 0.48801198801198803, "grad_norm": 1.9748685359954834, "learning_rate": 5.224527243998165e-05, "loss": 1.0311, "step": 9770 }, { "epoch": 0.48826173826173824, "grad_norm": 1.756616234779358, "learning_rate": 5.2219786941230444e-05, "loss": 0.985, "step": 9775 }, { "epoch": 0.4885114885114885, "grad_norm": 2.3403072357177734, "learning_rate": 5.219430144247923e-05, "loss": 1.0214, "step": 9780 }, { "epoch": 0.4887612387612388, "grad_norm": 6.317171573638916, "learning_rate": 5.216881594372802e-05, "loss": 1.0533, "step": 9785 }, { "epoch": 0.489010989010989, "grad_norm": 1.7395260334014893, "learning_rate": 5.2143330444976814e-05, "loss": 0.9527, "step": 9790 }, { "epoch": 0.48926073926073926, "grad_norm": 1.6687748432159424, "learning_rate": 5.21178449462256e-05, "loss": 1.0562, "step": 9795 }, { "epoch": 0.48951048951048953, "grad_norm": 1.660396695137024, "learning_rate": 5.209235944747439e-05, "loss": 1.099, "step": 9800 }, { "epoch": 0.48976023976023975, "grad_norm": 1.9136061668395996, "learning_rate": 5.2066873948723183e-05, "loss": 0.9754, "step": 9805 }, { "epoch": 0.49000999000999, "grad_norm": 1.8691514730453491, "learning_rate": 5.2041388449971976e-05, "loss": 0.9961, "step": 9810 }, { "epoch": 0.4902597402597403, "grad_norm": 1.9402247667312622, "learning_rate": 5.201590295122076e-05, "loss": 1.007, "step": 9815 }, { "epoch": 0.4905094905094905, "grad_norm": 1.8777105808258057, "learning_rate": 5.199041745246954e-05, "loss": 1.0694, "step": 9820 }, { "epoch": 0.49075924075924077, "grad_norm": 2.2065234184265137, "learning_rate": 5.196493195371833e-05, "loss": 1.0823, "step": 9825 }, { "epoch": 0.49100899100899104, "grad_norm": 1.5529628992080688, "learning_rate": 5.1939446454967124e-05, "loss": 1.0589, "step": 9830 }, { "epoch": 0.49125874125874125, "grad_norm": 1.569588303565979, "learning_rate": 5.191396095621591e-05, "loss": 1.0351, "step": 9835 }, { "epoch": 0.4915084915084915, "grad_norm": 1.7125402688980103, "learning_rate": 5.18884754574647e-05, "loss": 1.0685, "step": 9840 }, { "epoch": 0.49175824175824173, "grad_norm": 1.4826292991638184, "learning_rate": 5.1862989958713494e-05, "loss": 1.0228, "step": 9845 }, { "epoch": 0.492007992007992, "grad_norm": 1.5088510513305664, "learning_rate": 5.1837504459962286e-05, "loss": 0.9639, "step": 9850 }, { "epoch": 0.49225774225774227, "grad_norm": 2.9274518489837646, "learning_rate": 5.181201896121107e-05, "loss": 1.0581, "step": 9855 }, { "epoch": 0.4925074925074925, "grad_norm": 1.8532190322875977, "learning_rate": 5.1786533462459864e-05, "loss": 1.1132, "step": 9860 }, { "epoch": 0.49275724275724275, "grad_norm": 1.8097116947174072, "learning_rate": 5.1761047963708656e-05, "loss": 1.0695, "step": 9865 }, { "epoch": 0.493006993006993, "grad_norm": 1.6371272802352905, "learning_rate": 5.173556246495744e-05, "loss": 1.0697, "step": 9870 }, { "epoch": 0.49325674325674324, "grad_norm": 2.032860040664673, "learning_rate": 5.171007696620623e-05, "loss": 1.0223, "step": 9875 }, { "epoch": 0.4935064935064935, "grad_norm": 1.9306037425994873, "learning_rate": 5.1684591467455026e-05, "loss": 1.0477, "step": 9880 }, { "epoch": 0.4937562437562438, "grad_norm": 1.8617630004882812, "learning_rate": 5.165910596870381e-05, "loss": 1.0413, "step": 9885 }, { "epoch": 0.494005994005994, "grad_norm": 1.614455223083496, "learning_rate": 5.16336204699526e-05, "loss": 1.0074, "step": 9890 }, { "epoch": 0.49425574425574426, "grad_norm": 1.5984388589859009, "learning_rate": 5.1608134971201395e-05, "loss": 1.0092, "step": 9895 }, { "epoch": 0.4945054945054945, "grad_norm": 1.8377598524093628, "learning_rate": 5.1582649472450174e-05, "loss": 1.0425, "step": 9900 }, { "epoch": 0.49475524475524474, "grad_norm": 2.13116455078125, "learning_rate": 5.1557163973698966e-05, "loss": 1.0732, "step": 9905 }, { "epoch": 0.495004995004995, "grad_norm": 2.376474142074585, "learning_rate": 5.153167847494775e-05, "loss": 1.0529, "step": 9910 }, { "epoch": 0.4952547452547453, "grad_norm": 1.6326441764831543, "learning_rate": 5.1506192976196544e-05, "loss": 1.0537, "step": 9915 }, { "epoch": 0.4955044955044955, "grad_norm": 1.5635850429534912, "learning_rate": 5.1480707477445336e-05, "loss": 1.0153, "step": 9920 }, { "epoch": 0.49575424575424576, "grad_norm": 1.8969775438308716, "learning_rate": 5.145522197869412e-05, "loss": 0.9585, "step": 9925 }, { "epoch": 0.49600399600399603, "grad_norm": 1.8579003810882568, "learning_rate": 5.1429736479942913e-05, "loss": 0.9994, "step": 9930 }, { "epoch": 0.49625374625374624, "grad_norm": 1.75138521194458, "learning_rate": 5.1404250981191706e-05, "loss": 1.0777, "step": 9935 }, { "epoch": 0.4965034965034965, "grad_norm": 2.3218162059783936, "learning_rate": 5.137876548244049e-05, "loss": 0.9907, "step": 9940 }, { "epoch": 0.4967532467532468, "grad_norm": 1.9108942747116089, "learning_rate": 5.135327998368928e-05, "loss": 1.0027, "step": 9945 }, { "epoch": 0.497002997002997, "grad_norm": 1.7440747022628784, "learning_rate": 5.1327794484938075e-05, "loss": 1.118, "step": 9950 }, { "epoch": 0.49725274725274726, "grad_norm": 1.92374587059021, "learning_rate": 5.130230898618687e-05, "loss": 0.962, "step": 9955 }, { "epoch": 0.4975024975024975, "grad_norm": 2.1420788764953613, "learning_rate": 5.127682348743565e-05, "loss": 1.0636, "step": 9960 }, { "epoch": 0.49775224775224775, "grad_norm": 1.8587371110916138, "learning_rate": 5.1251337988684445e-05, "loss": 1.0209, "step": 9965 }, { "epoch": 0.498001998001998, "grad_norm": 2.2004218101501465, "learning_rate": 5.122585248993324e-05, "loss": 1.0235, "step": 9970 }, { "epoch": 0.4982517482517482, "grad_norm": 1.9216978549957275, "learning_rate": 5.120036699118202e-05, "loss": 0.9943, "step": 9975 }, { "epoch": 0.4985014985014985, "grad_norm": 1.9018975496292114, "learning_rate": 5.11748814924308e-05, "loss": 1.13, "step": 9980 }, { "epoch": 0.49875124875124877, "grad_norm": 1.9166200160980225, "learning_rate": 5.1149395993679594e-05, "loss": 0.9816, "step": 9985 }, { "epoch": 0.499000999000999, "grad_norm": 2.0807321071624756, "learning_rate": 5.1123910494928386e-05, "loss": 0.91, "step": 9990 }, { "epoch": 0.49925074925074925, "grad_norm": 1.4968241453170776, "learning_rate": 5.109842499617717e-05, "loss": 0.9956, "step": 9995 }, { "epoch": 0.4995004995004995, "grad_norm": 2.1511130332946777, "learning_rate": 5.107293949742596e-05, "loss": 1.1332, "step": 10000 }, { "epoch": 0.49975024975024973, "grad_norm": 1.986641764640808, "learning_rate": 5.1047453998674756e-05, "loss": 0.9728, "step": 10005 }, { "epoch": 0.5, "grad_norm": 1.4859191179275513, "learning_rate": 5.102196849992355e-05, "loss": 0.9851, "step": 10010 }, { "epoch": 0.5002497502497503, "grad_norm": 1.8236111402511597, "learning_rate": 5.099648300117233e-05, "loss": 0.9991, "step": 10015 }, { "epoch": 0.5004995004995005, "grad_norm": 2.0913445949554443, "learning_rate": 5.0970997502421125e-05, "loss": 1.0044, "step": 10020 }, { "epoch": 0.5007492507492507, "grad_norm": 1.731346845626831, "learning_rate": 5.094551200366992e-05, "loss": 1.0094, "step": 10025 }, { "epoch": 0.500999000999001, "grad_norm": 5.4002251625061035, "learning_rate": 5.09200265049187e-05, "loss": 1.0169, "step": 10030 }, { "epoch": 0.5012487512487512, "grad_norm": 1.9539554119110107, "learning_rate": 5.0894541006167495e-05, "loss": 0.9076, "step": 10035 }, { "epoch": 0.5014985014985015, "grad_norm": 2.1117892265319824, "learning_rate": 5.086905550741629e-05, "loss": 1.0188, "step": 10040 }, { "epoch": 0.5017482517482518, "grad_norm": 1.7515451908111572, "learning_rate": 5.084357000866507e-05, "loss": 0.9507, "step": 10045 }, { "epoch": 0.501998001998002, "grad_norm": 1.6576108932495117, "learning_rate": 5.0818084509913865e-05, "loss": 1.0068, "step": 10050 }, { "epoch": 0.5022477522477522, "grad_norm": 1.8035262823104858, "learning_rate": 5.079259901116266e-05, "loss": 0.956, "step": 10055 }, { "epoch": 0.5024975024975025, "grad_norm": 1.8694809675216675, "learning_rate": 5.0767113512411436e-05, "loss": 1.0582, "step": 10060 }, { "epoch": 0.5027472527472527, "grad_norm": 2.150698184967041, "learning_rate": 5.074162801366023e-05, "loss": 1.1221, "step": 10065 }, { "epoch": 0.502997002997003, "grad_norm": 2.132654905319214, "learning_rate": 5.071614251490901e-05, "loss": 1.0332, "step": 10070 }, { "epoch": 0.5032467532467533, "grad_norm": 1.5558087825775146, "learning_rate": 5.0690657016157805e-05, "loss": 1.0384, "step": 10075 }, { "epoch": 0.5034965034965035, "grad_norm": 2.3355419635772705, "learning_rate": 5.06651715174066e-05, "loss": 1.0224, "step": 10080 }, { "epoch": 0.5037462537462537, "grad_norm": 1.9186372756958008, "learning_rate": 5.063968601865538e-05, "loss": 0.9359, "step": 10085 }, { "epoch": 0.503996003996004, "grad_norm": 1.9626686573028564, "learning_rate": 5.0614200519904175e-05, "loss": 1.1014, "step": 10090 }, { "epoch": 0.5042457542457542, "grad_norm": 1.6580654382705688, "learning_rate": 5.058871502115297e-05, "loss": 1.0668, "step": 10095 }, { "epoch": 0.5044955044955045, "grad_norm": 2.1845762729644775, "learning_rate": 5.056322952240175e-05, "loss": 1.0207, "step": 10100 }, { "epoch": 0.5047452547452548, "grad_norm": 1.8094285726547241, "learning_rate": 5.0537744023650545e-05, "loss": 0.9882, "step": 10105 }, { "epoch": 0.504995004995005, "grad_norm": 2.2469394207000732, "learning_rate": 5.051225852489934e-05, "loss": 0.9822, "step": 10110 }, { "epoch": 0.5052447552447552, "grad_norm": 2.0558369159698486, "learning_rate": 5.048677302614813e-05, "loss": 0.9623, "step": 10115 }, { "epoch": 0.5054945054945055, "grad_norm": 2.155184507369995, "learning_rate": 5.0461287527396915e-05, "loss": 1.1056, "step": 10120 }, { "epoch": 0.5057442557442557, "grad_norm": 1.8485655784606934, "learning_rate": 5.043580202864571e-05, "loss": 1.0959, "step": 10125 }, { "epoch": 0.505994005994006, "grad_norm": 1.9354192018508911, "learning_rate": 5.04103165298945e-05, "loss": 0.9574, "step": 10130 }, { "epoch": 0.5062437562437563, "grad_norm": 1.8394899368286133, "learning_rate": 5.0384831031143284e-05, "loss": 1.0319, "step": 10135 }, { "epoch": 0.5064935064935064, "grad_norm": 2.1897218227386475, "learning_rate": 5.035934553239206e-05, "loss": 0.9867, "step": 10140 }, { "epoch": 0.5067432567432567, "grad_norm": 1.628366470336914, "learning_rate": 5.0333860033640855e-05, "loss": 0.9838, "step": 10145 }, { "epoch": 0.506993006993007, "grad_norm": 1.8402291536331177, "learning_rate": 5.030837453488965e-05, "loss": 1.0369, "step": 10150 }, { "epoch": 0.5072427572427572, "grad_norm": 2.0387606620788574, "learning_rate": 5.028288903613844e-05, "loss": 1.108, "step": 10155 }, { "epoch": 0.5074925074925075, "grad_norm": 1.3570879697799683, "learning_rate": 5.0257403537387225e-05, "loss": 1.095, "step": 10160 }, { "epoch": 0.5077422577422578, "grad_norm": 2.402221202850342, "learning_rate": 5.023191803863602e-05, "loss": 1.0783, "step": 10165 }, { "epoch": 0.5079920079920079, "grad_norm": 2.0921823978424072, "learning_rate": 5.020643253988481e-05, "loss": 0.9985, "step": 10170 }, { "epoch": 0.5082417582417582, "grad_norm": 2.0202977657318115, "learning_rate": 5.0180947041133595e-05, "loss": 1.014, "step": 10175 }, { "epoch": 0.5084915084915085, "grad_norm": 2.0506043434143066, "learning_rate": 5.015546154238239e-05, "loss": 1.0296, "step": 10180 }, { "epoch": 0.5087412587412588, "grad_norm": 1.589611530303955, "learning_rate": 5.012997604363118e-05, "loss": 1.0706, "step": 10185 }, { "epoch": 0.508991008991009, "grad_norm": 1.8982197046279907, "learning_rate": 5.0104490544879965e-05, "loss": 0.9744, "step": 10190 }, { "epoch": 0.5092407592407593, "grad_norm": 1.6929441690444946, "learning_rate": 5.007900504612876e-05, "loss": 1.0274, "step": 10195 }, { "epoch": 0.5094905094905094, "grad_norm": 2.1053647994995117, "learning_rate": 5.005351954737755e-05, "loss": 1.0798, "step": 10200 }, { "epoch": 0.5097402597402597, "grad_norm": 1.7968083620071411, "learning_rate": 5.0028034048626334e-05, "loss": 0.9987, "step": 10205 }, { "epoch": 0.50999000999001, "grad_norm": 1.5884658098220825, "learning_rate": 5.0002548549875127e-05, "loss": 1.0749, "step": 10210 }, { "epoch": 0.5102397602397603, "grad_norm": 2.606158971786499, "learning_rate": 4.997706305112391e-05, "loss": 0.9278, "step": 10215 }, { "epoch": 0.5104895104895105, "grad_norm": 1.9856775999069214, "learning_rate": 4.9951577552372704e-05, "loss": 0.9722, "step": 10220 }, { "epoch": 0.5107392607392608, "grad_norm": 1.4624133110046387, "learning_rate": 4.992609205362149e-05, "loss": 1.0216, "step": 10225 }, { "epoch": 0.510989010989011, "grad_norm": 2.908353805541992, "learning_rate": 4.990060655487028e-05, "loss": 0.9743, "step": 10230 }, { "epoch": 0.5112387612387612, "grad_norm": 1.763126015663147, "learning_rate": 4.9875121056119074e-05, "loss": 1.0103, "step": 10235 }, { "epoch": 0.5114885114885115, "grad_norm": 2.1565921306610107, "learning_rate": 4.984963555736786e-05, "loss": 1.1071, "step": 10240 }, { "epoch": 0.5117382617382618, "grad_norm": 1.790597677230835, "learning_rate": 4.9824150058616645e-05, "loss": 1.018, "step": 10245 }, { "epoch": 0.511988011988012, "grad_norm": 1.7595652341842651, "learning_rate": 4.979866455986544e-05, "loss": 1.1061, "step": 10250 }, { "epoch": 0.5122377622377622, "grad_norm": 1.3077837228775024, "learning_rate": 4.977317906111423e-05, "loss": 0.9488, "step": 10255 }, { "epoch": 0.5124875124875125, "grad_norm": 1.7513136863708496, "learning_rate": 4.974769356236302e-05, "loss": 1.054, "step": 10260 }, { "epoch": 0.5127372627372627, "grad_norm": 1.7060003280639648, "learning_rate": 4.972220806361181e-05, "loss": 1.1671, "step": 10265 }, { "epoch": 0.512987012987013, "grad_norm": 1.739479422569275, "learning_rate": 4.96967225648606e-05, "loss": 0.9976, "step": 10270 }, { "epoch": 0.5132367632367633, "grad_norm": 1.94993257522583, "learning_rate": 4.967123706610939e-05, "loss": 1.0179, "step": 10275 }, { "epoch": 0.5134865134865135, "grad_norm": 1.7628893852233887, "learning_rate": 4.964575156735817e-05, "loss": 1.19, "step": 10280 }, { "epoch": 0.5137362637362637, "grad_norm": 1.5925805568695068, "learning_rate": 4.962026606860696e-05, "loss": 0.9882, "step": 10285 }, { "epoch": 0.513986013986014, "grad_norm": 1.5014348030090332, "learning_rate": 4.9594780569855754e-05, "loss": 0.9788, "step": 10290 }, { "epoch": 0.5142357642357642, "grad_norm": 2.152209520339966, "learning_rate": 4.9569295071104546e-05, "loss": 1.1119, "step": 10295 }, { "epoch": 0.5144855144855145, "grad_norm": 2.4033255577087402, "learning_rate": 4.954380957235333e-05, "loss": 1.0036, "step": 10300 }, { "epoch": 0.5147352647352648, "grad_norm": 1.6501697301864624, "learning_rate": 4.9518324073602124e-05, "loss": 1.037, "step": 10305 }, { "epoch": 0.514985014985015, "grad_norm": 4.180097579956055, "learning_rate": 4.9492838574850916e-05, "loss": 0.9687, "step": 10310 }, { "epoch": 0.5152347652347652, "grad_norm": 1.948249340057373, "learning_rate": 4.94673530760997e-05, "loss": 1.0369, "step": 10315 }, { "epoch": 0.5154845154845155, "grad_norm": 3.289567470550537, "learning_rate": 4.944186757734849e-05, "loss": 1.0376, "step": 10320 }, { "epoch": 0.5157342657342657, "grad_norm": 2.7288527488708496, "learning_rate": 4.941638207859728e-05, "loss": 1.0269, "step": 10325 }, { "epoch": 0.515984015984016, "grad_norm": 1.7896794080734253, "learning_rate": 4.939089657984607e-05, "loss": 0.9994, "step": 10330 }, { "epoch": 0.5162337662337663, "grad_norm": 2.6502676010131836, "learning_rate": 4.9365411081094857e-05, "loss": 1.0814, "step": 10335 }, { "epoch": 0.5164835164835165, "grad_norm": 1.6050525903701782, "learning_rate": 4.933992558234365e-05, "loss": 0.9617, "step": 10340 }, { "epoch": 0.5167332667332667, "grad_norm": 1.5096427202224731, "learning_rate": 4.931444008359244e-05, "loss": 1.0294, "step": 10345 }, { "epoch": 0.516983016983017, "grad_norm": 2.4326164722442627, "learning_rate": 4.9288954584841226e-05, "loss": 1.014, "step": 10350 }, { "epoch": 0.5172327672327672, "grad_norm": 2.0236129760742188, "learning_rate": 4.926346908609002e-05, "loss": 0.9694, "step": 10355 }, { "epoch": 0.5174825174825175, "grad_norm": 1.9614331722259521, "learning_rate": 4.9237983587338804e-05, "loss": 1.066, "step": 10360 }, { "epoch": 0.5177322677322678, "grad_norm": 2.038485288619995, "learning_rate": 4.9212498088587596e-05, "loss": 1.0766, "step": 10365 }, { "epoch": 0.5179820179820179, "grad_norm": 1.7841075658798218, "learning_rate": 4.918701258983638e-05, "loss": 1.066, "step": 10370 }, { "epoch": 0.5182317682317682, "grad_norm": 1.6271713972091675, "learning_rate": 4.9161527091085174e-05, "loss": 1.0764, "step": 10375 }, { "epoch": 0.5184815184815185, "grad_norm": 1.5709089040756226, "learning_rate": 4.9136041592333966e-05, "loss": 0.9778, "step": 10380 }, { "epoch": 0.5187312687312687, "grad_norm": 1.7191498279571533, "learning_rate": 4.911055609358275e-05, "loss": 1.0708, "step": 10385 }, { "epoch": 0.518981018981019, "grad_norm": 3.875063180923462, "learning_rate": 4.9085070594831543e-05, "loss": 1.0486, "step": 10390 }, { "epoch": 0.5192307692307693, "grad_norm": 1.8126662969589233, "learning_rate": 4.9059585096080336e-05, "loss": 0.9436, "step": 10395 }, { "epoch": 0.5194805194805194, "grad_norm": 2.021678924560547, "learning_rate": 4.903409959732912e-05, "loss": 1.09, "step": 10400 }, { "epoch": 0.5197302697302697, "grad_norm": 1.782208800315857, "learning_rate": 4.9008614098577906e-05, "loss": 1.1084, "step": 10405 }, { "epoch": 0.51998001998002, "grad_norm": 1.702735424041748, "learning_rate": 4.89831285998267e-05, "loss": 1.1744, "step": 10410 }, { "epoch": 0.5202297702297702, "grad_norm": 2.3281002044677734, "learning_rate": 4.895764310107549e-05, "loss": 0.8744, "step": 10415 }, { "epoch": 0.5204795204795205, "grad_norm": 1.8853106498718262, "learning_rate": 4.893215760232428e-05, "loss": 1.0054, "step": 10420 }, { "epoch": 0.5207292707292708, "grad_norm": 2.0370025634765625, "learning_rate": 4.890667210357307e-05, "loss": 1.1139, "step": 10425 }, { "epoch": 0.5209790209790209, "grad_norm": 2.029564142227173, "learning_rate": 4.888118660482186e-05, "loss": 1.0848, "step": 10430 }, { "epoch": 0.5212287712287712, "grad_norm": 1.8114051818847656, "learning_rate": 4.885570110607065e-05, "loss": 0.9886, "step": 10435 }, { "epoch": 0.5214785214785215, "grad_norm": 2.2774369716644287, "learning_rate": 4.883021560731944e-05, "loss": 1.0019, "step": 10440 }, { "epoch": 0.5217282717282717, "grad_norm": 5.291658878326416, "learning_rate": 4.8804730108568224e-05, "loss": 1.0009, "step": 10445 }, { "epoch": 0.521978021978022, "grad_norm": 1.7368401288986206, "learning_rate": 4.8779244609817016e-05, "loss": 0.9392, "step": 10450 }, { "epoch": 0.5222277722277723, "grad_norm": 1.9365220069885254, "learning_rate": 4.875375911106581e-05, "loss": 1.1058, "step": 10455 }, { "epoch": 0.5224775224775224, "grad_norm": 2.3800556659698486, "learning_rate": 4.872827361231459e-05, "loss": 1.03, "step": 10460 }, { "epoch": 0.5227272727272727, "grad_norm": 1.827813982963562, "learning_rate": 4.8702788113563386e-05, "loss": 1.0351, "step": 10465 }, { "epoch": 0.522977022977023, "grad_norm": 2.049013137817383, "learning_rate": 4.867730261481218e-05, "loss": 0.9143, "step": 10470 }, { "epoch": 0.5232267732267732, "grad_norm": 1.9797253608703613, "learning_rate": 4.865181711606096e-05, "loss": 1.0236, "step": 10475 }, { "epoch": 0.5234765234765235, "grad_norm": 1.7983263731002808, "learning_rate": 4.862633161730975e-05, "loss": 0.9733, "step": 10480 }, { "epoch": 0.5237262737262737, "grad_norm": 1.7003098726272583, "learning_rate": 4.860084611855854e-05, "loss": 1.0161, "step": 10485 }, { "epoch": 0.5239760239760239, "grad_norm": 1.630346417427063, "learning_rate": 4.857536061980733e-05, "loss": 1.0695, "step": 10490 }, { "epoch": 0.5242257742257742, "grad_norm": 1.960113525390625, "learning_rate": 4.854987512105612e-05, "loss": 1.0004, "step": 10495 }, { "epoch": 0.5244755244755245, "grad_norm": 2.1183526515960693, "learning_rate": 4.852438962230491e-05, "loss": 1.0354, "step": 10500 }, { "epoch": 0.5247252747252747, "grad_norm": 1.6372559070587158, "learning_rate": 4.84989041235537e-05, "loss": 1.1108, "step": 10505 }, { "epoch": 0.524975024975025, "grad_norm": 1.4921700954437256, "learning_rate": 4.847341862480249e-05, "loss": 1.0169, "step": 10510 }, { "epoch": 0.5252247752247752, "grad_norm": 2.095839738845825, "learning_rate": 4.844793312605128e-05, "loss": 1.063, "step": 10515 }, { "epoch": 0.5254745254745254, "grad_norm": 1.6922940015792847, "learning_rate": 4.8422447627300066e-05, "loss": 1.0939, "step": 10520 }, { "epoch": 0.5257242757242757, "grad_norm": 1.4682185649871826, "learning_rate": 4.839696212854886e-05, "loss": 0.977, "step": 10525 }, { "epoch": 0.525974025974026, "grad_norm": 1.6749604940414429, "learning_rate": 4.837147662979764e-05, "loss": 1.0334, "step": 10530 }, { "epoch": 0.5262237762237763, "grad_norm": 1.7483326196670532, "learning_rate": 4.8345991131046435e-05, "loss": 1.0438, "step": 10535 }, { "epoch": 0.5264735264735265, "grad_norm": 1.727408766746521, "learning_rate": 4.832050563229523e-05, "loss": 1.005, "step": 10540 }, { "epoch": 0.5267232767232767, "grad_norm": 1.73333740234375, "learning_rate": 4.829502013354402e-05, "loss": 1.0206, "step": 10545 }, { "epoch": 0.526973026973027, "grad_norm": 1.9999842643737793, "learning_rate": 4.8269534634792805e-05, "loss": 1.0239, "step": 10550 }, { "epoch": 0.5272227772227772, "grad_norm": 2.2529776096343994, "learning_rate": 4.82440491360416e-05, "loss": 1.0556, "step": 10555 }, { "epoch": 0.5274725274725275, "grad_norm": 1.7254530191421509, "learning_rate": 4.821856363729038e-05, "loss": 0.9879, "step": 10560 }, { "epoch": 0.5277222777222778, "grad_norm": 1.9978423118591309, "learning_rate": 4.8193078138539175e-05, "loss": 1.0814, "step": 10565 }, { "epoch": 0.527972027972028, "grad_norm": 1.8624396324157715, "learning_rate": 4.816759263978796e-05, "loss": 0.9745, "step": 10570 }, { "epoch": 0.5282217782217782, "grad_norm": 2.1891183853149414, "learning_rate": 4.814210714103675e-05, "loss": 1.0517, "step": 10575 }, { "epoch": 0.5284715284715285, "grad_norm": 1.569190502166748, "learning_rate": 4.8116621642285545e-05, "loss": 0.9747, "step": 10580 }, { "epoch": 0.5287212787212787, "grad_norm": 1.9173314571380615, "learning_rate": 4.809113614353433e-05, "loss": 1.081, "step": 10585 }, { "epoch": 0.528971028971029, "grad_norm": 2.198762893676758, "learning_rate": 4.806565064478312e-05, "loss": 1.0108, "step": 10590 }, { "epoch": 0.5292207792207793, "grad_norm": 1.721721887588501, "learning_rate": 4.8040165146031914e-05, "loss": 1.0117, "step": 10595 }, { "epoch": 0.5294705294705294, "grad_norm": 2.3400440216064453, "learning_rate": 4.80146796472807e-05, "loss": 1.0857, "step": 10600 }, { "epoch": 0.5297202797202797, "grad_norm": 2.117434024810791, "learning_rate": 4.7989194148529485e-05, "loss": 1.113, "step": 10605 }, { "epoch": 0.52997002997003, "grad_norm": 1.7680771350860596, "learning_rate": 4.796370864977828e-05, "loss": 1.0374, "step": 10610 }, { "epoch": 0.5302197802197802, "grad_norm": 1.979880452156067, "learning_rate": 4.793822315102707e-05, "loss": 1.1341, "step": 10615 }, { "epoch": 0.5304695304695305, "grad_norm": 1.5856854915618896, "learning_rate": 4.7912737652275855e-05, "loss": 1.0012, "step": 10620 }, { "epoch": 0.5307192807192808, "grad_norm": 2.061319589614868, "learning_rate": 4.788725215352465e-05, "loss": 1.1089, "step": 10625 }, { "epoch": 0.5309690309690309, "grad_norm": 2.2198646068573, "learning_rate": 4.786176665477344e-05, "loss": 1.0243, "step": 10630 }, { "epoch": 0.5312187812187812, "grad_norm": 1.793178677558899, "learning_rate": 4.7836281156022225e-05, "loss": 1.0778, "step": 10635 }, { "epoch": 0.5314685314685315, "grad_norm": 1.8761465549468994, "learning_rate": 4.781079565727101e-05, "loss": 0.9923, "step": 10640 }, { "epoch": 0.5317182817182817, "grad_norm": 1.9043720960617065, "learning_rate": 4.77853101585198e-05, "loss": 1.089, "step": 10645 }, { "epoch": 0.531968031968032, "grad_norm": 2.552382469177246, "learning_rate": 4.7759824659768595e-05, "loss": 1.0277, "step": 10650 }, { "epoch": 0.5322177822177823, "grad_norm": 2.227675199508667, "learning_rate": 4.773433916101738e-05, "loss": 1.0108, "step": 10655 }, { "epoch": 0.5324675324675324, "grad_norm": 1.9968994855880737, "learning_rate": 4.770885366226617e-05, "loss": 1.0986, "step": 10660 }, { "epoch": 0.5327172827172827, "grad_norm": 1.7655466794967651, "learning_rate": 4.7683368163514964e-05, "loss": 1.0221, "step": 10665 }, { "epoch": 0.532967032967033, "grad_norm": 2.289557695388794, "learning_rate": 4.7657882664763757e-05, "loss": 0.9983, "step": 10670 }, { "epoch": 0.5332167832167832, "grad_norm": 2.119030237197876, "learning_rate": 4.763239716601254e-05, "loss": 1.0453, "step": 10675 }, { "epoch": 0.5334665334665335, "grad_norm": 1.9617866277694702, "learning_rate": 4.760691166726133e-05, "loss": 1.0272, "step": 10680 }, { "epoch": 0.5337162837162838, "grad_norm": 1.8910876512527466, "learning_rate": 4.758142616851012e-05, "loss": 1.0399, "step": 10685 }, { "epoch": 0.5339660339660339, "grad_norm": 2.7560293674468994, "learning_rate": 4.7555940669758905e-05, "loss": 0.9488, "step": 10690 }, { "epoch": 0.5342157842157842, "grad_norm": 2.6436283588409424, "learning_rate": 4.75304551710077e-05, "loss": 0.942, "step": 10695 }, { "epoch": 0.5344655344655345, "grad_norm": 1.70419442653656, "learning_rate": 4.750496967225649e-05, "loss": 1.0147, "step": 10700 }, { "epoch": 0.5347152847152847, "grad_norm": 1.6955325603485107, "learning_rate": 4.747948417350528e-05, "loss": 1.0518, "step": 10705 }, { "epoch": 0.534965034965035, "grad_norm": 1.6821141242980957, "learning_rate": 4.745399867475407e-05, "loss": 0.9931, "step": 10710 }, { "epoch": 0.5352147852147852, "grad_norm": 1.9281139373779297, "learning_rate": 4.742851317600286e-05, "loss": 1.0315, "step": 10715 }, { "epoch": 0.5354645354645354, "grad_norm": 2.4733364582061768, "learning_rate": 4.7403027677251644e-05, "loss": 0.9774, "step": 10720 }, { "epoch": 0.5357142857142857, "grad_norm": 2.166369915008545, "learning_rate": 4.737754217850044e-05, "loss": 1.097, "step": 10725 }, { "epoch": 0.535964035964036, "grad_norm": 1.7221876382827759, "learning_rate": 4.735205667974922e-05, "loss": 1.0157, "step": 10730 }, { "epoch": 0.5362137862137862, "grad_norm": 1.7203466892242432, "learning_rate": 4.7326571180998014e-05, "loss": 0.979, "step": 10735 }, { "epoch": 0.5364635364635365, "grad_norm": 2.4112048149108887, "learning_rate": 4.7301085682246806e-05, "loss": 1.0007, "step": 10740 }, { "epoch": 0.5367132867132867, "grad_norm": 1.6929134130477905, "learning_rate": 4.727560018349559e-05, "loss": 1.0086, "step": 10745 }, { "epoch": 0.5369630369630369, "grad_norm": 2.041346788406372, "learning_rate": 4.7250114684744384e-05, "loss": 0.9702, "step": 10750 }, { "epoch": 0.5372127872127872, "grad_norm": 2.1518359184265137, "learning_rate": 4.7224629185993176e-05, "loss": 0.9921, "step": 10755 }, { "epoch": 0.5374625374625375, "grad_norm": 1.6090643405914307, "learning_rate": 4.719914368724196e-05, "loss": 1.0367, "step": 10760 }, { "epoch": 0.5377122877122877, "grad_norm": 2.1290054321289062, "learning_rate": 4.717365818849075e-05, "loss": 1.0577, "step": 10765 }, { "epoch": 0.537962037962038, "grad_norm": 1.9955767393112183, "learning_rate": 4.714817268973954e-05, "loss": 1.1055, "step": 10770 }, { "epoch": 0.5382117882117882, "grad_norm": 2.349576234817505, "learning_rate": 4.712268719098833e-05, "loss": 0.932, "step": 10775 }, { "epoch": 0.5384615384615384, "grad_norm": 2.69212007522583, "learning_rate": 4.709720169223712e-05, "loss": 1.0717, "step": 10780 }, { "epoch": 0.5387112887112887, "grad_norm": 2.191384792327881, "learning_rate": 4.707171619348591e-05, "loss": 1.0004, "step": 10785 }, { "epoch": 0.538961038961039, "grad_norm": 1.8525850772857666, "learning_rate": 4.70462306947347e-05, "loss": 1.1096, "step": 10790 }, { "epoch": 0.5392107892107892, "grad_norm": 2.097797393798828, "learning_rate": 4.7020745195983487e-05, "loss": 0.9274, "step": 10795 }, { "epoch": 0.5394605394605395, "grad_norm": 1.8570269346237183, "learning_rate": 4.699525969723227e-05, "loss": 0.9238, "step": 10800 }, { "epoch": 0.5397102897102897, "grad_norm": 1.6937276124954224, "learning_rate": 4.6969774198481064e-05, "loss": 1.0377, "step": 10805 }, { "epoch": 0.5399600399600399, "grad_norm": 1.8426470756530762, "learning_rate": 4.6944288699729856e-05, "loss": 0.9459, "step": 10810 }, { "epoch": 0.5402097902097902, "grad_norm": 1.8896816968917847, "learning_rate": 4.691880320097864e-05, "loss": 1.1211, "step": 10815 }, { "epoch": 0.5404595404595405, "grad_norm": 1.9658395051956177, "learning_rate": 4.6893317702227434e-05, "loss": 1.0919, "step": 10820 }, { "epoch": 0.5407092907092907, "grad_norm": 1.633219599723816, "learning_rate": 4.6867832203476226e-05, "loss": 0.9951, "step": 10825 }, { "epoch": 0.5409590409590409, "grad_norm": 1.8907859325408936, "learning_rate": 4.684234670472502e-05, "loss": 1.0687, "step": 10830 }, { "epoch": 0.5412087912087912, "grad_norm": 1.72572922706604, "learning_rate": 4.6816861205973804e-05, "loss": 0.969, "step": 10835 }, { "epoch": 0.5414585414585414, "grad_norm": 1.736729621887207, "learning_rate": 4.679137570722259e-05, "loss": 0.9996, "step": 10840 }, { "epoch": 0.5417082917082917, "grad_norm": 1.7717705965042114, "learning_rate": 4.676589020847138e-05, "loss": 0.9818, "step": 10845 }, { "epoch": 0.541958041958042, "grad_norm": 1.683103322982788, "learning_rate": 4.6740404709720173e-05, "loss": 1.0524, "step": 10850 }, { "epoch": 0.5422077922077922, "grad_norm": 2.105308771133423, "learning_rate": 4.671491921096896e-05, "loss": 1.0222, "step": 10855 }, { "epoch": 0.5424575424575424, "grad_norm": 2.490571975708008, "learning_rate": 4.668943371221775e-05, "loss": 1.095, "step": 10860 }, { "epoch": 0.5427072927072927, "grad_norm": 1.5671800374984741, "learning_rate": 4.666394821346654e-05, "loss": 1.0849, "step": 10865 }, { "epoch": 0.542957042957043, "grad_norm": 1.7819477319717407, "learning_rate": 4.663846271471533e-05, "loss": 1.0855, "step": 10870 }, { "epoch": 0.5432067932067932, "grad_norm": 1.7613551616668701, "learning_rate": 4.661297721596412e-05, "loss": 1.0047, "step": 10875 }, { "epoch": 0.5434565434565435, "grad_norm": 2.4095914363861084, "learning_rate": 4.6587491717212906e-05, "loss": 0.9901, "step": 10880 }, { "epoch": 0.5437062937062938, "grad_norm": 3.7656748294830322, "learning_rate": 4.65620062184617e-05, "loss": 0.9173, "step": 10885 }, { "epoch": 0.5439560439560439, "grad_norm": 2.591468334197998, "learning_rate": 4.6536520719710484e-05, "loss": 1.0743, "step": 10890 }, { "epoch": 0.5442057942057942, "grad_norm": 1.5704014301300049, "learning_rate": 4.6511035220959276e-05, "loss": 0.9602, "step": 10895 }, { "epoch": 0.5444555444555444, "grad_norm": 1.5144566297531128, "learning_rate": 4.648554972220807e-05, "loss": 0.9522, "step": 10900 }, { "epoch": 0.5447052947052947, "grad_norm": 2.26674747467041, "learning_rate": 4.6460064223456854e-05, "loss": 0.9859, "step": 10905 }, { "epoch": 0.544955044955045, "grad_norm": 2.4553797245025635, "learning_rate": 4.6434578724705646e-05, "loss": 1.0091, "step": 10910 }, { "epoch": 0.5452047952047953, "grad_norm": 1.6010972261428833, "learning_rate": 4.640909322595444e-05, "loss": 1.0117, "step": 10915 }, { "epoch": 0.5454545454545454, "grad_norm": 2.0253806114196777, "learning_rate": 4.638360772720322e-05, "loss": 0.9884, "step": 10920 }, { "epoch": 0.5457042957042957, "grad_norm": 1.561132788658142, "learning_rate": 4.635812222845201e-05, "loss": 1.1489, "step": 10925 }, { "epoch": 0.545954045954046, "grad_norm": 1.7576236724853516, "learning_rate": 4.63326367297008e-05, "loss": 1.0801, "step": 10930 }, { "epoch": 0.5462037962037962, "grad_norm": 1.881152629852295, "learning_rate": 4.630715123094959e-05, "loss": 1.0233, "step": 10935 }, { "epoch": 0.5464535464535465, "grad_norm": 1.9929344654083252, "learning_rate": 4.628166573219838e-05, "loss": 0.9501, "step": 10940 }, { "epoch": 0.5467032967032966, "grad_norm": 1.7059142589569092, "learning_rate": 4.625618023344717e-05, "loss": 0.9522, "step": 10945 }, { "epoch": 0.5469530469530469, "grad_norm": 2.003077268600464, "learning_rate": 4.623069473469596e-05, "loss": 0.9634, "step": 10950 }, { "epoch": 0.5472027972027972, "grad_norm": 2.015565872192383, "learning_rate": 4.6205209235944755e-05, "loss": 1.0357, "step": 10955 }, { "epoch": 0.5474525474525475, "grad_norm": 1.803523302078247, "learning_rate": 4.6179723737193534e-05, "loss": 1.0589, "step": 10960 }, { "epoch": 0.5477022977022977, "grad_norm": 1.7196156978607178, "learning_rate": 4.6154238238442326e-05, "loss": 1.0245, "step": 10965 }, { "epoch": 0.547952047952048, "grad_norm": 1.8932229280471802, "learning_rate": 4.612875273969112e-05, "loss": 1.037, "step": 10970 }, { "epoch": 0.5482017982017982, "grad_norm": 1.7025026082992554, "learning_rate": 4.610326724093991e-05, "loss": 0.9478, "step": 10975 }, { "epoch": 0.5484515484515484, "grad_norm": 2.3904478549957275, "learning_rate": 4.6077781742188696e-05, "loss": 0.9772, "step": 10980 }, { "epoch": 0.5487012987012987, "grad_norm": 1.9494946002960205, "learning_rate": 4.605229624343749e-05, "loss": 1.0074, "step": 10985 }, { "epoch": 0.548951048951049, "grad_norm": 2.6505486965179443, "learning_rate": 4.602681074468628e-05, "loss": 1.0032, "step": 10990 }, { "epoch": 0.5492007992007992, "grad_norm": 2.8636019229888916, "learning_rate": 4.6001325245935065e-05, "loss": 0.9554, "step": 10995 }, { "epoch": 0.5494505494505495, "grad_norm": 2.045607805252075, "learning_rate": 4.597583974718385e-05, "loss": 1.1514, "step": 11000 }, { "epoch": 0.5497002997002997, "grad_norm": 1.8940155506134033, "learning_rate": 4.595035424843264e-05, "loss": 1.067, "step": 11005 }, { "epoch": 0.5499500499500499, "grad_norm": 2.6330723762512207, "learning_rate": 4.5924868749681435e-05, "loss": 1.0776, "step": 11010 }, { "epoch": 0.5501998001998002, "grad_norm": 1.892662763595581, "learning_rate": 4.589938325093022e-05, "loss": 1.0477, "step": 11015 }, { "epoch": 0.5504495504495505, "grad_norm": 1.8991339206695557, "learning_rate": 4.587389775217901e-05, "loss": 0.9288, "step": 11020 }, { "epoch": 0.5506993006993007, "grad_norm": 2.3266944885253906, "learning_rate": 4.5848412253427805e-05, "loss": 0.9725, "step": 11025 }, { "epoch": 0.550949050949051, "grad_norm": 1.9225058555603027, "learning_rate": 4.582292675467659e-05, "loss": 1.0898, "step": 11030 }, { "epoch": 0.5511988011988012, "grad_norm": 1.5720235109329224, "learning_rate": 4.579744125592538e-05, "loss": 0.9273, "step": 11035 }, { "epoch": 0.5514485514485514, "grad_norm": 2.5303800106048584, "learning_rate": 4.577195575717417e-05, "loss": 0.983, "step": 11040 }, { "epoch": 0.5516983016983017, "grad_norm": 1.7467522621154785, "learning_rate": 4.574647025842296e-05, "loss": 1.0635, "step": 11045 }, { "epoch": 0.551948051948052, "grad_norm": 4.610573768615723, "learning_rate": 4.5720984759671746e-05, "loss": 0.9474, "step": 11050 }, { "epoch": 0.5521978021978022, "grad_norm": 1.697086215019226, "learning_rate": 4.569549926092054e-05, "loss": 1.0347, "step": 11055 }, { "epoch": 0.5524475524475524, "grad_norm": 1.6885266304016113, "learning_rate": 4.567001376216933e-05, "loss": 1.028, "step": 11060 }, { "epoch": 0.5526973026973027, "grad_norm": 1.6800806522369385, "learning_rate": 4.5644528263418115e-05, "loss": 0.9873, "step": 11065 }, { "epoch": 0.5529470529470529, "grad_norm": 1.574774146080017, "learning_rate": 4.561904276466691e-05, "loss": 1.1467, "step": 11070 }, { "epoch": 0.5531968031968032, "grad_norm": 1.951435923576355, "learning_rate": 4.55935572659157e-05, "loss": 1.07, "step": 11075 }, { "epoch": 0.5534465534465535, "grad_norm": 1.63338303565979, "learning_rate": 4.5568071767164485e-05, "loss": 1.0613, "step": 11080 }, { "epoch": 0.5536963036963037, "grad_norm": 2.3568294048309326, "learning_rate": 4.554258626841327e-05, "loss": 1.1063, "step": 11085 }, { "epoch": 0.5539460539460539, "grad_norm": 1.819028615951538, "learning_rate": 4.551710076966206e-05, "loss": 1.0106, "step": 11090 }, { "epoch": 0.5541958041958042, "grad_norm": 1.648171067237854, "learning_rate": 4.5491615270910855e-05, "loss": 0.9722, "step": 11095 }, { "epoch": 0.5544455544455544, "grad_norm": 1.9574801921844482, "learning_rate": 4.546612977215964e-05, "loss": 1.016, "step": 11100 }, { "epoch": 0.5546953046953047, "grad_norm": 2.0966663360595703, "learning_rate": 4.544064427340843e-05, "loss": 1.0427, "step": 11105 }, { "epoch": 0.554945054945055, "grad_norm": 2.7019777297973633, "learning_rate": 4.5415158774657225e-05, "loss": 1.0427, "step": 11110 }, { "epoch": 0.5551948051948052, "grad_norm": 2.143784523010254, "learning_rate": 4.538967327590602e-05, "loss": 0.9533, "step": 11115 }, { "epoch": 0.5554445554445554, "grad_norm": 2.0740156173706055, "learning_rate": 4.5364187777154795e-05, "loss": 1.0459, "step": 11120 }, { "epoch": 0.5556943056943057, "grad_norm": 2.227280855178833, "learning_rate": 4.533870227840359e-05, "loss": 1.0225, "step": 11125 }, { "epoch": 0.5559440559440559, "grad_norm": 1.5724329948425293, "learning_rate": 4.531321677965238e-05, "loss": 1.0144, "step": 11130 }, { "epoch": 0.5561938061938062, "grad_norm": 2.626450300216675, "learning_rate": 4.528773128090117e-05, "loss": 1.0759, "step": 11135 }, { "epoch": 0.5564435564435565, "grad_norm": 1.7094043493270874, "learning_rate": 4.526224578214996e-05, "loss": 1.0816, "step": 11140 }, { "epoch": 0.5566933066933067, "grad_norm": 2.2203426361083984, "learning_rate": 4.523676028339875e-05, "loss": 1.0197, "step": 11145 }, { "epoch": 0.5569430569430569, "grad_norm": 1.689178466796875, "learning_rate": 4.521127478464754e-05, "loss": 1.0565, "step": 11150 }, { "epoch": 0.5571928071928072, "grad_norm": 2.219802141189575, "learning_rate": 4.518578928589633e-05, "loss": 0.9702, "step": 11155 }, { "epoch": 0.5574425574425574, "grad_norm": 1.9337729215621948, "learning_rate": 4.516030378714511e-05, "loss": 1.0063, "step": 11160 }, { "epoch": 0.5576923076923077, "grad_norm": 1.5413296222686768, "learning_rate": 4.5134818288393905e-05, "loss": 0.9806, "step": 11165 }, { "epoch": 0.557942057942058, "grad_norm": 2.652912139892578, "learning_rate": 4.51093327896427e-05, "loss": 0.9995, "step": 11170 }, { "epoch": 0.5581918081918081, "grad_norm": 1.9666283130645752, "learning_rate": 4.508384729089148e-05, "loss": 0.9109, "step": 11175 }, { "epoch": 0.5584415584415584, "grad_norm": 1.6014527082443237, "learning_rate": 4.5058361792140274e-05, "loss": 0.9359, "step": 11180 }, { "epoch": 0.5586913086913087, "grad_norm": 2.137362003326416, "learning_rate": 4.503287629338907e-05, "loss": 1.0042, "step": 11185 }, { "epoch": 0.5589410589410589, "grad_norm": 1.5616415739059448, "learning_rate": 4.500739079463785e-05, "loss": 1.0828, "step": 11190 }, { "epoch": 0.5591908091908092, "grad_norm": 1.9336650371551514, "learning_rate": 4.4981905295886644e-05, "loss": 0.9628, "step": 11195 }, { "epoch": 0.5594405594405595, "grad_norm": 1.7945518493652344, "learning_rate": 4.495641979713543e-05, "loss": 0.9809, "step": 11200 }, { "epoch": 0.5596903096903096, "grad_norm": 1.9656338691711426, "learning_rate": 4.493093429838422e-05, "loss": 1.0949, "step": 11205 }, { "epoch": 0.5599400599400599, "grad_norm": 1.8820648193359375, "learning_rate": 4.490544879963301e-05, "loss": 0.9938, "step": 11210 }, { "epoch": 0.5601898101898102, "grad_norm": 2.1812477111816406, "learning_rate": 4.48799633008818e-05, "loss": 1.0363, "step": 11215 }, { "epoch": 0.5604395604395604, "grad_norm": 1.5776993036270142, "learning_rate": 4.485447780213059e-05, "loss": 0.9985, "step": 11220 }, { "epoch": 0.5606893106893107, "grad_norm": 2.6446173191070557, "learning_rate": 4.482899230337938e-05, "loss": 1.1805, "step": 11225 }, { "epoch": 0.560939060939061, "grad_norm": 1.6966280937194824, "learning_rate": 4.480350680462817e-05, "loss": 1.1049, "step": 11230 }, { "epoch": 0.5611888111888111, "grad_norm": 1.566955327987671, "learning_rate": 4.477802130587696e-05, "loss": 0.9611, "step": 11235 }, { "epoch": 0.5614385614385614, "grad_norm": 1.8438693284988403, "learning_rate": 4.475253580712575e-05, "loss": 1.008, "step": 11240 }, { "epoch": 0.5616883116883117, "grad_norm": 1.676999568939209, "learning_rate": 4.472705030837453e-05, "loss": 1.0172, "step": 11245 }, { "epoch": 0.561938061938062, "grad_norm": 1.431728482246399, "learning_rate": 4.4701564809623324e-05, "loss": 1.1322, "step": 11250 }, { "epoch": 0.5621878121878122, "grad_norm": 3.17277455329895, "learning_rate": 4.4676079310872117e-05, "loss": 0.9843, "step": 11255 }, { "epoch": 0.5624375624375625, "grad_norm": 2.399710178375244, "learning_rate": 4.465059381212091e-05, "loss": 1.0954, "step": 11260 }, { "epoch": 0.5626873126873126, "grad_norm": 1.938697099685669, "learning_rate": 4.4625108313369694e-05, "loss": 1.106, "step": 11265 }, { "epoch": 0.5629370629370629, "grad_norm": 1.5464006662368774, "learning_rate": 4.4599622814618486e-05, "loss": 1.0699, "step": 11270 }, { "epoch": 0.5631868131868132, "grad_norm": 2.323896646499634, "learning_rate": 4.457413731586728e-05, "loss": 0.9832, "step": 11275 }, { "epoch": 0.5634365634365635, "grad_norm": 2.4159767627716064, "learning_rate": 4.454865181711606e-05, "loss": 1.0123, "step": 11280 }, { "epoch": 0.5636863136863137, "grad_norm": 2.5501248836517334, "learning_rate": 4.452316631836485e-05, "loss": 0.8968, "step": 11285 }, { "epoch": 0.563936063936064, "grad_norm": 1.85370934009552, "learning_rate": 4.449768081961364e-05, "loss": 1.0231, "step": 11290 }, { "epoch": 0.5641858141858141, "grad_norm": 1.5208295583724976, "learning_rate": 4.4472195320862434e-05, "loss": 1.0702, "step": 11295 }, { "epoch": 0.5644355644355644, "grad_norm": 2.473689317703247, "learning_rate": 4.444670982211122e-05, "loss": 0.9892, "step": 11300 }, { "epoch": 0.5646853146853147, "grad_norm": 1.8165267705917358, "learning_rate": 4.442122432336001e-05, "loss": 1.0255, "step": 11305 }, { "epoch": 0.564935064935065, "grad_norm": 1.846514105796814, "learning_rate": 4.4395738824608803e-05, "loss": 0.9881, "step": 11310 }, { "epoch": 0.5651848151848152, "grad_norm": 1.783138632774353, "learning_rate": 4.437025332585759e-05, "loss": 1.03, "step": 11315 }, { "epoch": 0.5654345654345654, "grad_norm": 2.203596353530884, "learning_rate": 4.4344767827106374e-05, "loss": 0.9951, "step": 11320 }, { "epoch": 0.5656843156843157, "grad_norm": 2.056151866912842, "learning_rate": 4.4319282328355166e-05, "loss": 1.0672, "step": 11325 }, { "epoch": 0.5659340659340659, "grad_norm": 1.8266443014144897, "learning_rate": 4.429379682960396e-05, "loss": 1.1452, "step": 11330 }, { "epoch": 0.5661838161838162, "grad_norm": 2.3899474143981934, "learning_rate": 4.4268311330852744e-05, "loss": 1.0589, "step": 11335 }, { "epoch": 0.5664335664335665, "grad_norm": 2.165414333343506, "learning_rate": 4.4242825832101536e-05, "loss": 0.9656, "step": 11340 }, { "epoch": 0.5666833166833167, "grad_norm": 1.9252963066101074, "learning_rate": 4.421734033335033e-05, "loss": 0.9387, "step": 11345 }, { "epoch": 0.5669330669330669, "grad_norm": 1.8118185997009277, "learning_rate": 4.4191854834599114e-05, "loss": 1.0682, "step": 11350 }, { "epoch": 0.5671828171828172, "grad_norm": 1.7838951349258423, "learning_rate": 4.4166369335847906e-05, "loss": 0.9683, "step": 11355 }, { "epoch": 0.5674325674325674, "grad_norm": 1.6587882041931152, "learning_rate": 4.414088383709669e-05, "loss": 0.9593, "step": 11360 }, { "epoch": 0.5676823176823177, "grad_norm": 2.5877013206481934, "learning_rate": 4.4115398338345484e-05, "loss": 1.0548, "step": 11365 }, { "epoch": 0.567932067932068, "grad_norm": 1.8029611110687256, "learning_rate": 4.408991283959427e-05, "loss": 1.0707, "step": 11370 }, { "epoch": 0.5681818181818182, "grad_norm": 1.9176095724105835, "learning_rate": 4.406442734084306e-05, "loss": 1.0034, "step": 11375 }, { "epoch": 0.5684315684315684, "grad_norm": 4.906855583190918, "learning_rate": 4.403894184209185e-05, "loss": 0.9617, "step": 11380 }, { "epoch": 0.5686813186813187, "grad_norm": 4.21867561340332, "learning_rate": 4.4013456343340646e-05, "loss": 1.0995, "step": 11385 }, { "epoch": 0.5689310689310689, "grad_norm": 2.342682361602783, "learning_rate": 4.398797084458943e-05, "loss": 1.0126, "step": 11390 }, { "epoch": 0.5691808191808192, "grad_norm": 1.6096895933151245, "learning_rate": 4.396248534583822e-05, "loss": 1.117, "step": 11395 }, { "epoch": 0.5694305694305695, "grad_norm": 2.0556085109710693, "learning_rate": 4.393699984708701e-05, "loss": 1.0288, "step": 11400 }, { "epoch": 0.5696803196803197, "grad_norm": 1.9399186372756958, "learning_rate": 4.3911514348335794e-05, "loss": 1.0684, "step": 11405 }, { "epoch": 0.5699300699300699, "grad_norm": 1.7195922136306763, "learning_rate": 4.3886028849584586e-05, "loss": 1.0832, "step": 11410 }, { "epoch": 0.5701798201798202, "grad_norm": 2.51728892326355, "learning_rate": 4.386054335083338e-05, "loss": 1.0522, "step": 11415 }, { "epoch": 0.5704295704295704, "grad_norm": 2.4342949390411377, "learning_rate": 4.383505785208217e-05, "loss": 1.0357, "step": 11420 }, { "epoch": 0.5706793206793207, "grad_norm": 1.8538497686386108, "learning_rate": 4.3809572353330956e-05, "loss": 0.9761, "step": 11425 }, { "epoch": 0.570929070929071, "grad_norm": 2.0012903213500977, "learning_rate": 4.378408685457975e-05, "loss": 1.0035, "step": 11430 }, { "epoch": 0.5711788211788211, "grad_norm": 1.8685760498046875, "learning_rate": 4.375860135582854e-05, "loss": 1.0339, "step": 11435 }, { "epoch": 0.5714285714285714, "grad_norm": 1.8456110954284668, "learning_rate": 4.3733115857077326e-05, "loss": 1.0574, "step": 11440 }, { "epoch": 0.5716783216783217, "grad_norm": 1.739162802696228, "learning_rate": 4.370763035832611e-05, "loss": 1.0897, "step": 11445 }, { "epoch": 0.5719280719280719, "grad_norm": 1.896079421043396, "learning_rate": 4.36821448595749e-05, "loss": 0.9927, "step": 11450 }, { "epoch": 0.5721778221778222, "grad_norm": 1.8981748819351196, "learning_rate": 4.3656659360823695e-05, "loss": 1.096, "step": 11455 }, { "epoch": 0.5724275724275725, "grad_norm": 2.2317261695861816, "learning_rate": 4.363117386207248e-05, "loss": 0.943, "step": 11460 }, { "epoch": 0.5726773226773226, "grad_norm": 1.697849154472351, "learning_rate": 4.360568836332127e-05, "loss": 1.0972, "step": 11465 }, { "epoch": 0.5729270729270729, "grad_norm": 1.5122568607330322, "learning_rate": 4.3580202864570065e-05, "loss": 1.0246, "step": 11470 }, { "epoch": 0.5731768231768232, "grad_norm": 1.5713346004486084, "learning_rate": 4.355471736581885e-05, "loss": 0.95, "step": 11475 }, { "epoch": 0.5734265734265734, "grad_norm": 2.1044158935546875, "learning_rate": 4.3529231867067636e-05, "loss": 1.0462, "step": 11480 }, { "epoch": 0.5736763236763237, "grad_norm": 1.8352504968643188, "learning_rate": 4.350374636831643e-05, "loss": 0.9725, "step": 11485 }, { "epoch": 0.573926073926074, "grad_norm": 1.754981279373169, "learning_rate": 4.347826086956522e-05, "loss": 0.9413, "step": 11490 }, { "epoch": 0.5741758241758241, "grad_norm": 1.8544729948043823, "learning_rate": 4.3452775370814006e-05, "loss": 1.0305, "step": 11495 }, { "epoch": 0.5744255744255744, "grad_norm": 2.122689962387085, "learning_rate": 4.34272898720628e-05, "loss": 0.9541, "step": 11500 }, { "epoch": 0.5746753246753247, "grad_norm": 2.3668196201324463, "learning_rate": 4.340180437331159e-05, "loss": 1.0576, "step": 11505 }, { "epoch": 0.5749250749250749, "grad_norm": 1.8058409690856934, "learning_rate": 4.3376318874560376e-05, "loss": 0.965, "step": 11510 }, { "epoch": 0.5751748251748252, "grad_norm": 1.9096729755401611, "learning_rate": 4.335083337580917e-05, "loss": 0.9392, "step": 11515 }, { "epoch": 0.5754245754245755, "grad_norm": 1.6076315641403198, "learning_rate": 4.332534787705795e-05, "loss": 0.9754, "step": 11520 }, { "epoch": 0.5756743256743256, "grad_norm": 1.5563503503799438, "learning_rate": 4.3299862378306745e-05, "loss": 1.1616, "step": 11525 }, { "epoch": 0.5759240759240759, "grad_norm": 1.3561556339263916, "learning_rate": 4.327437687955553e-05, "loss": 0.9601, "step": 11530 }, { "epoch": 0.5761738261738262, "grad_norm": 2.213383913040161, "learning_rate": 4.324889138080432e-05, "loss": 1.0573, "step": 11535 }, { "epoch": 0.5764235764235764, "grad_norm": 1.9381892681121826, "learning_rate": 4.3223405882053115e-05, "loss": 1.0524, "step": 11540 }, { "epoch": 0.5766733266733267, "grad_norm": 1.5318629741668701, "learning_rate": 4.319792038330191e-05, "loss": 1.0109, "step": 11545 }, { "epoch": 0.5769230769230769, "grad_norm": 1.5473605394363403, "learning_rate": 4.317243488455069e-05, "loss": 1.0943, "step": 11550 }, { "epoch": 0.5771728271728271, "grad_norm": 1.9802182912826538, "learning_rate": 4.3146949385799485e-05, "loss": 1.0674, "step": 11555 }, { "epoch": 0.5774225774225774, "grad_norm": 2.255134105682373, "learning_rate": 4.312146388704827e-05, "loss": 1.0427, "step": 11560 }, { "epoch": 0.5776723276723277, "grad_norm": 1.7673649787902832, "learning_rate": 4.309597838829706e-05, "loss": 1.1212, "step": 11565 }, { "epoch": 0.577922077922078, "grad_norm": 2.102323293685913, "learning_rate": 4.307049288954585e-05, "loss": 0.9947, "step": 11570 }, { "epoch": 0.5781718281718282, "grad_norm": 1.7684345245361328, "learning_rate": 4.304500739079464e-05, "loss": 1.0041, "step": 11575 }, { "epoch": 0.5784215784215784, "grad_norm": 1.838768482208252, "learning_rate": 4.301952189204343e-05, "loss": 1.1076, "step": 11580 }, { "epoch": 0.5786713286713286, "grad_norm": 1.6242948770523071, "learning_rate": 4.299403639329222e-05, "loss": 0.9626, "step": 11585 }, { "epoch": 0.5789210789210789, "grad_norm": 2.3542561531066895, "learning_rate": 4.296855089454101e-05, "loss": 1.0971, "step": 11590 }, { "epoch": 0.5791708291708292, "grad_norm": 1.8393601179122925, "learning_rate": 4.29430653957898e-05, "loss": 0.9999, "step": 11595 }, { "epoch": 0.5794205794205795, "grad_norm": 1.826846957206726, "learning_rate": 4.291757989703859e-05, "loss": 1.0211, "step": 11600 }, { "epoch": 0.5796703296703297, "grad_norm": 1.9960795640945435, "learning_rate": 4.289209439828737e-05, "loss": 1.1369, "step": 11605 }, { "epoch": 0.5799200799200799, "grad_norm": 1.975723147392273, "learning_rate": 4.2866608899536165e-05, "loss": 1.0836, "step": 11610 }, { "epoch": 0.5801698301698301, "grad_norm": 2.0478694438934326, "learning_rate": 4.284112340078496e-05, "loss": 1.0006, "step": 11615 }, { "epoch": 0.5804195804195804, "grad_norm": 1.7400479316711426, "learning_rate": 4.281563790203374e-05, "loss": 1.0532, "step": 11620 }, { "epoch": 0.5806693306693307, "grad_norm": 1.8064656257629395, "learning_rate": 4.2790152403282535e-05, "loss": 1.0062, "step": 11625 }, { "epoch": 0.580919080919081, "grad_norm": 2.2288122177124023, "learning_rate": 4.276466690453133e-05, "loss": 1.06, "step": 11630 }, { "epoch": 0.5811688311688312, "grad_norm": 1.748428225517273, "learning_rate": 4.273918140578011e-05, "loss": 0.9499, "step": 11635 }, { "epoch": 0.5814185814185814, "grad_norm": 1.9746248722076416, "learning_rate": 4.27136959070289e-05, "loss": 0.9719, "step": 11640 }, { "epoch": 0.5816683316683317, "grad_norm": 2.6370203495025635, "learning_rate": 4.268821040827769e-05, "loss": 0.9954, "step": 11645 }, { "epoch": 0.5819180819180819, "grad_norm": 1.7931537628173828, "learning_rate": 4.266272490952648e-05, "loss": 1.1225, "step": 11650 }, { "epoch": 0.5821678321678322, "grad_norm": 1.5068913698196411, "learning_rate": 4.263723941077527e-05, "loss": 0.9151, "step": 11655 }, { "epoch": 0.5824175824175825, "grad_norm": 1.9279522895812988, "learning_rate": 4.261175391202406e-05, "loss": 1.0463, "step": 11660 }, { "epoch": 0.5826673326673326, "grad_norm": 2.191014051437378, "learning_rate": 4.258626841327285e-05, "loss": 1.0868, "step": 11665 }, { "epoch": 0.5829170829170829, "grad_norm": 2.1446564197540283, "learning_rate": 4.2560782914521644e-05, "loss": 1.0077, "step": 11670 }, { "epoch": 0.5831668331668332, "grad_norm": 1.7688651084899902, "learning_rate": 4.253529741577043e-05, "loss": 1.077, "step": 11675 }, { "epoch": 0.5834165834165834, "grad_norm": 1.9743175506591797, "learning_rate": 4.2509811917019215e-05, "loss": 1.1338, "step": 11680 }, { "epoch": 0.5836663336663337, "grad_norm": 1.9691134691238403, "learning_rate": 4.248432641826801e-05, "loss": 0.9539, "step": 11685 }, { "epoch": 0.583916083916084, "grad_norm": 1.738929033279419, "learning_rate": 4.245884091951679e-05, "loss": 1.0344, "step": 11690 }, { "epoch": 0.5841658341658341, "grad_norm": 3.5216622352600098, "learning_rate": 4.2433355420765585e-05, "loss": 1.0897, "step": 11695 }, { "epoch": 0.5844155844155844, "grad_norm": 1.8885751962661743, "learning_rate": 4.240786992201438e-05, "loss": 1.027, "step": 11700 }, { "epoch": 0.5846653346653347, "grad_norm": 1.8530664443969727, "learning_rate": 4.238238442326317e-05, "loss": 1.0907, "step": 11705 }, { "epoch": 0.5849150849150849, "grad_norm": 1.7301623821258545, "learning_rate": 4.2356898924511954e-05, "loss": 0.9757, "step": 11710 }, { "epoch": 0.5851648351648352, "grad_norm": 2.3676650524139404, "learning_rate": 4.2331413425760747e-05, "loss": 0.9865, "step": 11715 }, { "epoch": 0.5854145854145855, "grad_norm": 1.6462810039520264, "learning_rate": 4.230592792700953e-05, "loss": 0.9984, "step": 11720 }, { "epoch": 0.5856643356643356, "grad_norm": 1.924193263053894, "learning_rate": 4.2280442428258324e-05, "loss": 1.071, "step": 11725 }, { "epoch": 0.5859140859140859, "grad_norm": 2.327021837234497, "learning_rate": 4.225495692950711e-05, "loss": 0.9869, "step": 11730 }, { "epoch": 0.5861638361638362, "grad_norm": 1.874643325805664, "learning_rate": 4.22294714307559e-05, "loss": 1.0663, "step": 11735 }, { "epoch": 0.5864135864135864, "grad_norm": 2.0613934993743896, "learning_rate": 4.2203985932004694e-05, "loss": 1.1587, "step": 11740 }, { "epoch": 0.5866633366633367, "grad_norm": 1.9255084991455078, "learning_rate": 4.217850043325348e-05, "loss": 0.9717, "step": 11745 }, { "epoch": 0.586913086913087, "grad_norm": 1.907364845275879, "learning_rate": 4.215301493450227e-05, "loss": 1.0746, "step": 11750 }, { "epoch": 0.5871628371628371, "grad_norm": 1.862642526626587, "learning_rate": 4.2127529435751064e-05, "loss": 0.9524, "step": 11755 }, { "epoch": 0.5874125874125874, "grad_norm": 1.7797350883483887, "learning_rate": 4.210204393699985e-05, "loss": 1.0168, "step": 11760 }, { "epoch": 0.5876623376623377, "grad_norm": 1.4859908819198608, "learning_rate": 4.2076558438248635e-05, "loss": 1.1032, "step": 11765 }, { "epoch": 0.5879120879120879, "grad_norm": 1.850453495979309, "learning_rate": 4.205107293949743e-05, "loss": 0.9912, "step": 11770 }, { "epoch": 0.5881618381618382, "grad_norm": 1.781798243522644, "learning_rate": 4.202558744074622e-05, "loss": 1.04, "step": 11775 }, { "epoch": 0.5884115884115884, "grad_norm": 3.1102981567382812, "learning_rate": 4.2000101941995004e-05, "loss": 1.0501, "step": 11780 }, { "epoch": 0.5886613386613386, "grad_norm": 1.7622472047805786, "learning_rate": 4.1974616443243796e-05, "loss": 0.9883, "step": 11785 }, { "epoch": 0.5889110889110889, "grad_norm": 1.9252958297729492, "learning_rate": 4.194913094449259e-05, "loss": 1.015, "step": 11790 }, { "epoch": 0.5891608391608392, "grad_norm": 1.702030897140503, "learning_rate": 4.1923645445741374e-05, "loss": 0.998, "step": 11795 }, { "epoch": 0.5894105894105894, "grad_norm": 2.4419209957122803, "learning_rate": 4.189815994699016e-05, "loss": 1.0478, "step": 11800 }, { "epoch": 0.5896603396603397, "grad_norm": 1.825932502746582, "learning_rate": 4.187267444823895e-05, "loss": 1.023, "step": 11805 }, { "epoch": 0.5899100899100899, "grad_norm": 1.7342149019241333, "learning_rate": 4.1847188949487744e-05, "loss": 0.9855, "step": 11810 }, { "epoch": 0.5901598401598401, "grad_norm": 2.2588565349578857, "learning_rate": 4.182170345073653e-05, "loss": 1.1183, "step": 11815 }, { "epoch": 0.5904095904095904, "grad_norm": 2.0705018043518066, "learning_rate": 4.179621795198532e-05, "loss": 1.0239, "step": 11820 }, { "epoch": 0.5906593406593407, "grad_norm": 2.0090487003326416, "learning_rate": 4.1770732453234114e-05, "loss": 0.9447, "step": 11825 }, { "epoch": 0.5909090909090909, "grad_norm": 1.9004836082458496, "learning_rate": 4.1745246954482906e-05, "loss": 0.9523, "step": 11830 }, { "epoch": 0.5911588411588412, "grad_norm": 2.2067155838012695, "learning_rate": 4.171976145573169e-05, "loss": 0.9617, "step": 11835 }, { "epoch": 0.5914085914085914, "grad_norm": 2.208631992340088, "learning_rate": 4.1694275956980477e-05, "loss": 1.0681, "step": 11840 }, { "epoch": 0.5916583416583416, "grad_norm": 2.385709047317505, "learning_rate": 4.166879045822927e-05, "loss": 1.0285, "step": 11845 }, { "epoch": 0.5919080919080919, "grad_norm": 1.8119724988937378, "learning_rate": 4.164330495947806e-05, "loss": 1.0889, "step": 11850 }, { "epoch": 0.5921578421578422, "grad_norm": 2.0381417274475098, "learning_rate": 4.1617819460726846e-05, "loss": 1.0501, "step": 11855 }, { "epoch": 0.5924075924075924, "grad_norm": 2.133204221725464, "learning_rate": 4.159233396197564e-05, "loss": 1.0468, "step": 11860 }, { "epoch": 0.5926573426573427, "grad_norm": 1.617150068283081, "learning_rate": 4.156684846322443e-05, "loss": 1.0091, "step": 11865 }, { "epoch": 0.5929070929070929, "grad_norm": 3.3665459156036377, "learning_rate": 4.1541362964473216e-05, "loss": 0.9377, "step": 11870 }, { "epoch": 0.5931568431568431, "grad_norm": 1.868099331855774, "learning_rate": 4.151587746572201e-05, "loss": 1.0542, "step": 11875 }, { "epoch": 0.5934065934065934, "grad_norm": 1.7966622114181519, "learning_rate": 4.1490391966970794e-05, "loss": 1.0079, "step": 11880 }, { "epoch": 0.5936563436563437, "grad_norm": 2.4564294815063477, "learning_rate": 4.1464906468219586e-05, "loss": 1.0176, "step": 11885 }, { "epoch": 0.593906093906094, "grad_norm": 1.6721248626708984, "learning_rate": 4.143942096946837e-05, "loss": 1.0127, "step": 11890 }, { "epoch": 0.5941558441558441, "grad_norm": 1.5256189107894897, "learning_rate": 4.1413935470717163e-05, "loss": 0.9883, "step": 11895 }, { "epoch": 0.5944055944055944, "grad_norm": 2.2357513904571533, "learning_rate": 4.1388449971965956e-05, "loss": 1.0672, "step": 11900 }, { "epoch": 0.5946553446553446, "grad_norm": 1.638623595237732, "learning_rate": 4.136296447321474e-05, "loss": 1.0632, "step": 11905 }, { "epoch": 0.5949050949050949, "grad_norm": 2.3139448165893555, "learning_rate": 4.133747897446353e-05, "loss": 0.9725, "step": 11910 }, { "epoch": 0.5951548451548452, "grad_norm": 1.6423486471176147, "learning_rate": 4.1311993475712325e-05, "loss": 1.0063, "step": 11915 }, { "epoch": 0.5954045954045954, "grad_norm": 1.7153905630111694, "learning_rate": 4.128650797696111e-05, "loss": 0.9406, "step": 11920 }, { "epoch": 0.5956543456543456, "grad_norm": 2.050271987915039, "learning_rate": 4.1261022478209896e-05, "loss": 0.9839, "step": 11925 }, { "epoch": 0.5959040959040959, "grad_norm": 1.7814981937408447, "learning_rate": 4.123553697945869e-05, "loss": 0.9238, "step": 11930 }, { "epoch": 0.5961538461538461, "grad_norm": 2.710594654083252, "learning_rate": 4.121005148070748e-05, "loss": 1.1227, "step": 11935 }, { "epoch": 0.5964035964035964, "grad_norm": 1.4550127983093262, "learning_rate": 4.1184565981956266e-05, "loss": 1.0417, "step": 11940 }, { "epoch": 0.5966533466533467, "grad_norm": 1.7633469104766846, "learning_rate": 4.115908048320506e-05, "loss": 0.9335, "step": 11945 }, { "epoch": 0.596903096903097, "grad_norm": 2.116783857345581, "learning_rate": 4.113359498445385e-05, "loss": 0.9367, "step": 11950 }, { "epoch": 0.5971528471528471, "grad_norm": 1.9305974245071411, "learning_rate": 4.110810948570264e-05, "loss": 1.0851, "step": 11955 }, { "epoch": 0.5974025974025974, "grad_norm": 1.7889987230300903, "learning_rate": 4.108262398695142e-05, "loss": 1.0817, "step": 11960 }, { "epoch": 0.5976523476523476, "grad_norm": 1.7660090923309326, "learning_rate": 4.105713848820021e-05, "loss": 0.9868, "step": 11965 }, { "epoch": 0.5979020979020979, "grad_norm": 1.816837191581726, "learning_rate": 4.1031652989449006e-05, "loss": 1.0071, "step": 11970 }, { "epoch": 0.5981518481518482, "grad_norm": 1.8390569686889648, "learning_rate": 4.10061674906978e-05, "loss": 1.0077, "step": 11975 }, { "epoch": 0.5984015984015985, "grad_norm": 1.7051702737808228, "learning_rate": 4.098068199194658e-05, "loss": 0.9963, "step": 11980 }, { "epoch": 0.5986513486513486, "grad_norm": 1.8439427614212036, "learning_rate": 4.0955196493195375e-05, "loss": 1.0433, "step": 11985 }, { "epoch": 0.5989010989010989, "grad_norm": 2.1053481101989746, "learning_rate": 4.092971099444417e-05, "loss": 0.9978, "step": 11990 }, { "epoch": 0.5991508491508492, "grad_norm": 1.7521613836288452, "learning_rate": 4.090422549569295e-05, "loss": 1.0194, "step": 11995 }, { "epoch": 0.5994005994005994, "grad_norm": 1.6349599361419678, "learning_rate": 4.087873999694174e-05, "loss": 0.9826, "step": 12000 }, { "epoch": 0.5996503496503497, "grad_norm": 1.934890627861023, "learning_rate": 4.085325449819053e-05, "loss": 1.0449, "step": 12005 }, { "epoch": 0.5999000999000998, "grad_norm": 2.116364002227783, "learning_rate": 4.082776899943932e-05, "loss": 1.0738, "step": 12010 }, { "epoch": 0.6001498501498501, "grad_norm": 2.13967227935791, "learning_rate": 4.080228350068811e-05, "loss": 0.9433, "step": 12015 }, { "epoch": 0.6003996003996004, "grad_norm": 1.6893976926803589, "learning_rate": 4.07767980019369e-05, "loss": 0.9982, "step": 12020 }, { "epoch": 0.6006493506493507, "grad_norm": 1.7289113998413086, "learning_rate": 4.075131250318569e-05, "loss": 1.0936, "step": 12025 }, { "epoch": 0.6008991008991009, "grad_norm": 2.3046069145202637, "learning_rate": 4.072582700443448e-05, "loss": 1.0357, "step": 12030 }, { "epoch": 0.6011488511488512, "grad_norm": 1.8844445943832397, "learning_rate": 4.070034150568327e-05, "loss": 0.9107, "step": 12035 }, { "epoch": 0.6013986013986014, "grad_norm": 2.6080665588378906, "learning_rate": 4.0674856006932055e-05, "loss": 0.9777, "step": 12040 }, { "epoch": 0.6016483516483516, "grad_norm": 1.676339030265808, "learning_rate": 4.064937050818085e-05, "loss": 1.1136, "step": 12045 }, { "epoch": 0.6018981018981019, "grad_norm": 1.9086414575576782, "learning_rate": 4.062388500942963e-05, "loss": 1.0615, "step": 12050 }, { "epoch": 0.6021478521478522, "grad_norm": 1.8902066946029663, "learning_rate": 4.0598399510678425e-05, "loss": 1.086, "step": 12055 }, { "epoch": 0.6023976023976024, "grad_norm": 1.7325515747070312, "learning_rate": 4.057291401192722e-05, "loss": 1.1049, "step": 12060 }, { "epoch": 0.6026473526473527, "grad_norm": 1.7236831188201904, "learning_rate": 4.0547428513176e-05, "loss": 1.054, "step": 12065 }, { "epoch": 0.6028971028971029, "grad_norm": 1.7189269065856934, "learning_rate": 4.0521943014424795e-05, "loss": 1.0784, "step": 12070 }, { "epoch": 0.6031468531468531, "grad_norm": 1.7672425508499146, "learning_rate": 4.049645751567359e-05, "loss": 0.9708, "step": 12075 }, { "epoch": 0.6033966033966034, "grad_norm": 1.7705951929092407, "learning_rate": 4.047097201692237e-05, "loss": 1.0841, "step": 12080 }, { "epoch": 0.6036463536463537, "grad_norm": 1.9714468717575073, "learning_rate": 4.044548651817116e-05, "loss": 0.9427, "step": 12085 }, { "epoch": 0.6038961038961039, "grad_norm": 1.8555059432983398, "learning_rate": 4.042000101941995e-05, "loss": 0.9477, "step": 12090 }, { "epoch": 0.6041458541458542, "grad_norm": 1.6516308784484863, "learning_rate": 4.039451552066874e-05, "loss": 0.9835, "step": 12095 }, { "epoch": 0.6043956043956044, "grad_norm": 1.6227937936782837, "learning_rate": 4.036903002191753e-05, "loss": 0.9977, "step": 12100 }, { "epoch": 0.6046453546453546, "grad_norm": 1.9059734344482422, "learning_rate": 4.034354452316632e-05, "loss": 1.0802, "step": 12105 }, { "epoch": 0.6048951048951049, "grad_norm": 1.9279249906539917, "learning_rate": 4.031805902441511e-05, "loss": 1.0785, "step": 12110 }, { "epoch": 0.6051448551448552, "grad_norm": 1.5602335929870605, "learning_rate": 4.0292573525663904e-05, "loss": 1.0493, "step": 12115 }, { "epoch": 0.6053946053946054, "grad_norm": 1.313078761100769, "learning_rate": 4.026708802691268e-05, "loss": 1.0136, "step": 12120 }, { "epoch": 0.6056443556443556, "grad_norm": 1.6762309074401855, "learning_rate": 4.0241602528161475e-05, "loss": 0.9881, "step": 12125 }, { "epoch": 0.6058941058941059, "grad_norm": 1.6275943517684937, "learning_rate": 4.021611702941027e-05, "loss": 0.9554, "step": 12130 }, { "epoch": 0.6061438561438561, "grad_norm": 1.7676833868026733, "learning_rate": 4.019063153065906e-05, "loss": 0.9425, "step": 12135 }, { "epoch": 0.6063936063936064, "grad_norm": 1.770856261253357, "learning_rate": 4.0165146031907845e-05, "loss": 1.0954, "step": 12140 }, { "epoch": 0.6066433566433567, "grad_norm": 2.1002392768859863, "learning_rate": 4.013966053315664e-05, "loss": 1.0369, "step": 12145 }, { "epoch": 0.6068931068931069, "grad_norm": 2.0773496627807617, "learning_rate": 4.011417503440543e-05, "loss": 1.0066, "step": 12150 }, { "epoch": 0.6071428571428571, "grad_norm": 1.7726023197174072, "learning_rate": 4.0088689535654215e-05, "loss": 0.9021, "step": 12155 }, { "epoch": 0.6073926073926074, "grad_norm": 1.8271493911743164, "learning_rate": 4.0063204036903e-05, "loss": 1.0189, "step": 12160 }, { "epoch": 0.6076423576423576, "grad_norm": 2.2117934226989746, "learning_rate": 4.003771853815179e-05, "loss": 0.9903, "step": 12165 }, { "epoch": 0.6078921078921079, "grad_norm": 1.451125144958496, "learning_rate": 4.0012233039400584e-05, "loss": 1.1038, "step": 12170 }, { "epoch": 0.6081418581418582, "grad_norm": 1.5099107027053833, "learning_rate": 3.998674754064937e-05, "loss": 1.0117, "step": 12175 }, { "epoch": 0.6083916083916084, "grad_norm": 2.3489513397216797, "learning_rate": 3.996126204189816e-05, "loss": 1.0505, "step": 12180 }, { "epoch": 0.6086413586413586, "grad_norm": 1.8818230628967285, "learning_rate": 3.9935776543146954e-05, "loss": 1.058, "step": 12185 }, { "epoch": 0.6088911088911089, "grad_norm": 1.4845952987670898, "learning_rate": 3.991029104439574e-05, "loss": 0.9691, "step": 12190 }, { "epoch": 0.6091408591408591, "grad_norm": 1.9357770681381226, "learning_rate": 3.988480554564453e-05, "loss": 1.0312, "step": 12195 }, { "epoch": 0.6093906093906094, "grad_norm": 1.9364995956420898, "learning_rate": 3.985932004689332e-05, "loss": 0.926, "step": 12200 }, { "epoch": 0.6096403596403597, "grad_norm": 1.7030882835388184, "learning_rate": 3.983383454814211e-05, "loss": 0.9604, "step": 12205 }, { "epoch": 0.6098901098901099, "grad_norm": 2.1250898838043213, "learning_rate": 3.9808349049390895e-05, "loss": 0.9442, "step": 12210 }, { "epoch": 0.6101398601398601, "grad_norm": 2.9585723876953125, "learning_rate": 3.978286355063969e-05, "loss": 0.9657, "step": 12215 }, { "epoch": 0.6103896103896104, "grad_norm": 2.063505172729492, "learning_rate": 3.975737805188848e-05, "loss": 1.1325, "step": 12220 }, { "epoch": 0.6106393606393606, "grad_norm": 2.14145827293396, "learning_rate": 3.9731892553137264e-05, "loss": 0.9879, "step": 12225 }, { "epoch": 0.6108891108891109, "grad_norm": 1.5676467418670654, "learning_rate": 3.970640705438606e-05, "loss": 0.9292, "step": 12230 }, { "epoch": 0.6111388611388612, "grad_norm": 1.8991336822509766, "learning_rate": 3.968092155563485e-05, "loss": 1.0275, "step": 12235 }, { "epoch": 0.6113886113886113, "grad_norm": 2.3709137439727783, "learning_rate": 3.9655436056883634e-05, "loss": 0.9768, "step": 12240 }, { "epoch": 0.6116383616383616, "grad_norm": 2.0699422359466553, "learning_rate": 3.962995055813242e-05, "loss": 1.051, "step": 12245 }, { "epoch": 0.6118881118881119, "grad_norm": 1.7920935153961182, "learning_rate": 3.960446505938121e-05, "loss": 1.06, "step": 12250 }, { "epoch": 0.6121378621378621, "grad_norm": 1.708717703819275, "learning_rate": 3.9578979560630004e-05, "loss": 1.0489, "step": 12255 }, { "epoch": 0.6123876123876124, "grad_norm": 2.882168769836426, "learning_rate": 3.9553494061878796e-05, "loss": 1.0802, "step": 12260 }, { "epoch": 0.6126373626373627, "grad_norm": 1.9551362991333008, "learning_rate": 3.952800856312758e-05, "loss": 0.9459, "step": 12265 }, { "epoch": 0.6128871128871128, "grad_norm": 1.4344658851623535, "learning_rate": 3.9502523064376374e-05, "loss": 1.1256, "step": 12270 }, { "epoch": 0.6131368631368631, "grad_norm": 2.396420955657959, "learning_rate": 3.9477037565625166e-05, "loss": 1.0576, "step": 12275 }, { "epoch": 0.6133866133866134, "grad_norm": 1.9305007457733154, "learning_rate": 3.9451552066873945e-05, "loss": 0.9297, "step": 12280 }, { "epoch": 0.6136363636363636, "grad_norm": 1.9748773574829102, "learning_rate": 3.942606656812274e-05, "loss": 1.0911, "step": 12285 }, { "epoch": 0.6138861138861139, "grad_norm": 1.6783887147903442, "learning_rate": 3.940058106937153e-05, "loss": 1.0523, "step": 12290 }, { "epoch": 0.6141358641358642, "grad_norm": 1.6354718208312988, "learning_rate": 3.937509557062032e-05, "loss": 1.0363, "step": 12295 }, { "epoch": 0.6143856143856143, "grad_norm": 2.024932384490967, "learning_rate": 3.9349610071869107e-05, "loss": 1.0091, "step": 12300 }, { "epoch": 0.6146353646353646, "grad_norm": 1.8973650932312012, "learning_rate": 3.93241245731179e-05, "loss": 1.1105, "step": 12305 }, { "epoch": 0.6148851148851149, "grad_norm": 1.9965786933898926, "learning_rate": 3.929863907436669e-05, "loss": 1.0314, "step": 12310 }, { "epoch": 0.6151348651348651, "grad_norm": 2.229177236557007, "learning_rate": 3.9273153575615476e-05, "loss": 1.1753, "step": 12315 }, { "epoch": 0.6153846153846154, "grad_norm": 1.5248008966445923, "learning_rate": 3.924766807686426e-05, "loss": 0.9849, "step": 12320 }, { "epoch": 0.6156343656343657, "grad_norm": 1.5948240756988525, "learning_rate": 3.9222182578113054e-05, "loss": 0.925, "step": 12325 }, { "epoch": 0.6158841158841158, "grad_norm": 2.0162487030029297, "learning_rate": 3.9196697079361846e-05, "loss": 1.0369, "step": 12330 }, { "epoch": 0.6161338661338661, "grad_norm": 1.733738660812378, "learning_rate": 3.917121158061063e-05, "loss": 1.0598, "step": 12335 }, { "epoch": 0.6163836163836164, "grad_norm": 2.268946409225464, "learning_rate": 3.9145726081859424e-05, "loss": 0.9763, "step": 12340 }, { "epoch": 0.6166333666333667, "grad_norm": 3.2546005249023438, "learning_rate": 3.9120240583108216e-05, "loss": 1.0129, "step": 12345 }, { "epoch": 0.6168831168831169, "grad_norm": 1.7366929054260254, "learning_rate": 3.9094755084357e-05, "loss": 0.9644, "step": 12350 }, { "epoch": 0.6171328671328671, "grad_norm": 2.224156379699707, "learning_rate": 3.9069269585605793e-05, "loss": 0.9421, "step": 12355 }, { "epoch": 0.6173826173826173, "grad_norm": 1.8952362537384033, "learning_rate": 3.904378408685458e-05, "loss": 0.91, "step": 12360 }, { "epoch": 0.6176323676323676, "grad_norm": 1.962277889251709, "learning_rate": 3.901829858810337e-05, "loss": 1.0291, "step": 12365 }, { "epoch": 0.6178821178821179, "grad_norm": 1.8677723407745361, "learning_rate": 3.8992813089352156e-05, "loss": 1.1902, "step": 12370 }, { "epoch": 0.6181318681318682, "grad_norm": 2.2777414321899414, "learning_rate": 3.896732759060095e-05, "loss": 1.0686, "step": 12375 }, { "epoch": 0.6183816183816184, "grad_norm": 2.0545907020568848, "learning_rate": 3.894184209184974e-05, "loss": 1.0817, "step": 12380 }, { "epoch": 0.6186313686313686, "grad_norm": 1.590526819229126, "learning_rate": 3.891635659309853e-05, "loss": 1.0149, "step": 12385 }, { "epoch": 0.6188811188811189, "grad_norm": 2.3402366638183594, "learning_rate": 3.889087109434732e-05, "loss": 1.0532, "step": 12390 }, { "epoch": 0.6191308691308691, "grad_norm": 1.7427630424499512, "learning_rate": 3.886538559559611e-05, "loss": 1.0894, "step": 12395 }, { "epoch": 0.6193806193806194, "grad_norm": 2.0278737545013428, "learning_rate": 3.8839900096844896e-05, "loss": 0.995, "step": 12400 }, { "epoch": 0.6196303696303697, "grad_norm": 2.192770004272461, "learning_rate": 3.881441459809368e-05, "loss": 0.9772, "step": 12405 }, { "epoch": 0.6198801198801199, "grad_norm": 1.437747836112976, "learning_rate": 3.8788929099342474e-05, "loss": 0.8787, "step": 12410 }, { "epoch": 0.6201298701298701, "grad_norm": 1.821458101272583, "learning_rate": 3.8763443600591266e-05, "loss": 1.081, "step": 12415 }, { "epoch": 0.6203796203796204, "grad_norm": 1.5295093059539795, "learning_rate": 3.873795810184006e-05, "loss": 1.0704, "step": 12420 }, { "epoch": 0.6206293706293706, "grad_norm": 1.5165185928344727, "learning_rate": 3.871247260308884e-05, "loss": 1.0616, "step": 12425 }, { "epoch": 0.6208791208791209, "grad_norm": 1.9153774976730347, "learning_rate": 3.8686987104337636e-05, "loss": 0.9856, "step": 12430 }, { "epoch": 0.6211288711288712, "grad_norm": 1.662726879119873, "learning_rate": 3.866150160558643e-05, "loss": 0.9911, "step": 12435 }, { "epoch": 0.6213786213786214, "grad_norm": 2.195155382156372, "learning_rate": 3.863601610683521e-05, "loss": 1.0882, "step": 12440 }, { "epoch": 0.6216283716283716, "grad_norm": 1.6951839923858643, "learning_rate": 3.8610530608084e-05, "loss": 1.003, "step": 12445 }, { "epoch": 0.6218781218781219, "grad_norm": 1.7599847316741943, "learning_rate": 3.858504510933279e-05, "loss": 1.0503, "step": 12450 }, { "epoch": 0.6221278721278721, "grad_norm": 1.9391034841537476, "learning_rate": 3.855955961058158e-05, "loss": 1.0668, "step": 12455 }, { "epoch": 0.6223776223776224, "grad_norm": 2.1306405067443848, "learning_rate": 3.853407411183037e-05, "loss": 0.9685, "step": 12460 }, { "epoch": 0.6226273726273727, "grad_norm": 2.0500776767730713, "learning_rate": 3.850858861307916e-05, "loss": 1.0, "step": 12465 }, { "epoch": 0.6228771228771228, "grad_norm": 1.8707716464996338, "learning_rate": 3.848310311432795e-05, "loss": 0.9673, "step": 12470 }, { "epoch": 0.6231268731268731, "grad_norm": 2.3700780868530273, "learning_rate": 3.845761761557674e-05, "loss": 1.0644, "step": 12475 }, { "epoch": 0.6233766233766234, "grad_norm": 1.7207790613174438, "learning_rate": 3.8432132116825523e-05, "loss": 0.9648, "step": 12480 }, { "epoch": 0.6236263736263736, "grad_norm": 1.7474174499511719, "learning_rate": 3.8406646618074316e-05, "loss": 0.995, "step": 12485 }, { "epoch": 0.6238761238761239, "grad_norm": 1.7458157539367676, "learning_rate": 3.838116111932311e-05, "loss": 0.9868, "step": 12490 }, { "epoch": 0.6241258741258742, "grad_norm": 1.8571330308914185, "learning_rate": 3.835567562057189e-05, "loss": 0.9391, "step": 12495 }, { "epoch": 0.6243756243756243, "grad_norm": 1.882039189338684, "learning_rate": 3.8330190121820685e-05, "loss": 0.9771, "step": 12500 }, { "epoch": 0.6246253746253746, "grad_norm": 2.1027004718780518, "learning_rate": 3.830470462306948e-05, "loss": 1.0408, "step": 12505 }, { "epoch": 0.6248751248751249, "grad_norm": 2.246072769165039, "learning_rate": 3.827921912431826e-05, "loss": 1.0948, "step": 12510 }, { "epoch": 0.6251248751248751, "grad_norm": 1.792468547821045, "learning_rate": 3.8253733625567055e-05, "loss": 1.0403, "step": 12515 }, { "epoch": 0.6253746253746254, "grad_norm": 1.8191008567810059, "learning_rate": 3.822824812681584e-05, "loss": 1.0413, "step": 12520 }, { "epoch": 0.6256243756243757, "grad_norm": 1.8269435167312622, "learning_rate": 3.820276262806463e-05, "loss": 1.0706, "step": 12525 }, { "epoch": 0.6258741258741258, "grad_norm": 1.884639024734497, "learning_rate": 3.817727712931342e-05, "loss": 1.0335, "step": 12530 }, { "epoch": 0.6261238761238761, "grad_norm": 1.7573273181915283, "learning_rate": 3.815179163056221e-05, "loss": 1.0499, "step": 12535 }, { "epoch": 0.6263736263736264, "grad_norm": 1.8695167303085327, "learning_rate": 3.8126306131811e-05, "loss": 0.9661, "step": 12540 }, { "epoch": 0.6266233766233766, "grad_norm": 2.273432731628418, "learning_rate": 3.8100820633059795e-05, "loss": 1.008, "step": 12545 }, { "epoch": 0.6268731268731269, "grad_norm": 1.708207607269287, "learning_rate": 3.807533513430858e-05, "loss": 0.9841, "step": 12550 }, { "epoch": 0.6271228771228772, "grad_norm": 2.6524040699005127, "learning_rate": 3.804984963555737e-05, "loss": 1.0428, "step": 12555 }, { "epoch": 0.6273726273726273, "grad_norm": 1.888733983039856, "learning_rate": 3.802436413680616e-05, "loss": 0.9648, "step": 12560 }, { "epoch": 0.6276223776223776, "grad_norm": 2.30357027053833, "learning_rate": 3.799887863805495e-05, "loss": 1.025, "step": 12565 }, { "epoch": 0.6278721278721279, "grad_norm": 2.8197758197784424, "learning_rate": 3.7973393139303735e-05, "loss": 0.9775, "step": 12570 }, { "epoch": 0.6281218781218781, "grad_norm": 1.7924872636795044, "learning_rate": 3.794790764055253e-05, "loss": 1.003, "step": 12575 }, { "epoch": 0.6283716283716284, "grad_norm": 2.037564277648926, "learning_rate": 3.792242214180132e-05, "loss": 0.9485, "step": 12580 }, { "epoch": 0.6286213786213786, "grad_norm": 2.7847962379455566, "learning_rate": 3.7896936643050105e-05, "loss": 0.9958, "step": 12585 }, { "epoch": 0.6288711288711288, "grad_norm": 1.839518427848816, "learning_rate": 3.78714511442989e-05, "loss": 1.0506, "step": 12590 }, { "epoch": 0.6291208791208791, "grad_norm": 1.6577616930007935, "learning_rate": 3.784596564554769e-05, "loss": 0.887, "step": 12595 }, { "epoch": 0.6293706293706294, "grad_norm": 2.0470449924468994, "learning_rate": 3.7820480146796475e-05, "loss": 1.0359, "step": 12600 }, { "epoch": 0.6296203796203796, "grad_norm": 1.9067198038101196, "learning_rate": 3.779499464804526e-05, "loss": 1.1631, "step": 12605 }, { "epoch": 0.6298701298701299, "grad_norm": 2.0465943813323975, "learning_rate": 3.776950914929405e-05, "loss": 1.0447, "step": 12610 }, { "epoch": 0.6301198801198801, "grad_norm": 1.563302755355835, "learning_rate": 3.7744023650542845e-05, "loss": 1.0221, "step": 12615 }, { "epoch": 0.6303696303696303, "grad_norm": 1.7037928104400635, "learning_rate": 3.771853815179163e-05, "loss": 0.9916, "step": 12620 }, { "epoch": 0.6306193806193806, "grad_norm": 2.092188596725464, "learning_rate": 3.769305265304042e-05, "loss": 0.9928, "step": 12625 }, { "epoch": 0.6308691308691309, "grad_norm": 2.0303196907043457, "learning_rate": 3.7667567154289214e-05, "loss": 1.0307, "step": 12630 }, { "epoch": 0.6311188811188811, "grad_norm": 1.5680900812149048, "learning_rate": 3.7642081655538e-05, "loss": 1.1636, "step": 12635 }, { "epoch": 0.6313686313686314, "grad_norm": 1.6437342166900635, "learning_rate": 3.7616596156786785e-05, "loss": 0.9703, "step": 12640 }, { "epoch": 0.6316183816183816, "grad_norm": 3.5905628204345703, "learning_rate": 3.759111065803558e-05, "loss": 1.0431, "step": 12645 }, { "epoch": 0.6318681318681318, "grad_norm": 2.245359182357788, "learning_rate": 3.756562515928437e-05, "loss": 1.0037, "step": 12650 }, { "epoch": 0.6321178821178821, "grad_norm": 2.0063183307647705, "learning_rate": 3.7540139660533155e-05, "loss": 0.9791, "step": 12655 }, { "epoch": 0.6323676323676324, "grad_norm": 2.239750623703003, "learning_rate": 3.751465416178195e-05, "loss": 1.1021, "step": 12660 }, { "epoch": 0.6326173826173827, "grad_norm": 1.8606270551681519, "learning_rate": 3.748916866303074e-05, "loss": 1.0303, "step": 12665 }, { "epoch": 0.6328671328671329, "grad_norm": 1.9843782186508179, "learning_rate": 3.746368316427953e-05, "loss": 1.036, "step": 12670 }, { "epoch": 0.6331168831168831, "grad_norm": 3.063251256942749, "learning_rate": 3.743819766552832e-05, "loss": 1.025, "step": 12675 }, { "epoch": 0.6333666333666333, "grad_norm": 2.6186885833740234, "learning_rate": 3.74127121667771e-05, "loss": 1.135, "step": 12680 }, { "epoch": 0.6336163836163836, "grad_norm": 4.330862998962402, "learning_rate": 3.7387226668025894e-05, "loss": 1.014, "step": 12685 }, { "epoch": 0.6338661338661339, "grad_norm": 1.599074363708496, "learning_rate": 3.736174116927468e-05, "loss": 0.9331, "step": 12690 }, { "epoch": 0.6341158841158842, "grad_norm": 1.6180766820907593, "learning_rate": 3.733625567052347e-05, "loss": 1.0828, "step": 12695 }, { "epoch": 0.6343656343656343, "grad_norm": 2.1217241287231445, "learning_rate": 3.7310770171772264e-05, "loss": 1.0035, "step": 12700 }, { "epoch": 0.6346153846153846, "grad_norm": 1.9831637144088745, "learning_rate": 3.7285284673021056e-05, "loss": 0.9512, "step": 12705 }, { "epoch": 0.6348651348651349, "grad_norm": 2.1465868949890137, "learning_rate": 3.725979917426984e-05, "loss": 1.0139, "step": 12710 }, { "epoch": 0.6351148851148851, "grad_norm": 1.8616533279418945, "learning_rate": 3.7234313675518634e-05, "loss": 1.0487, "step": 12715 }, { "epoch": 0.6353646353646354, "grad_norm": 1.3975887298583984, "learning_rate": 3.720882817676742e-05, "loss": 0.9551, "step": 12720 }, { "epoch": 0.6356143856143857, "grad_norm": 1.8468170166015625, "learning_rate": 3.718334267801621e-05, "loss": 1.0068, "step": 12725 }, { "epoch": 0.6358641358641358, "grad_norm": 1.798540711402893, "learning_rate": 3.7157857179265e-05, "loss": 0.9978, "step": 12730 }, { "epoch": 0.6361138861138861, "grad_norm": 1.6924251317977905, "learning_rate": 3.713237168051379e-05, "loss": 1.028, "step": 12735 }, { "epoch": 0.6363636363636364, "grad_norm": 2.2600436210632324, "learning_rate": 3.710688618176258e-05, "loss": 0.8648, "step": 12740 }, { "epoch": 0.6366133866133866, "grad_norm": 2.3213465213775635, "learning_rate": 3.708140068301137e-05, "loss": 1.0107, "step": 12745 }, { "epoch": 0.6368631368631369, "grad_norm": 2.0415773391723633, "learning_rate": 3.705591518426016e-05, "loss": 1.0473, "step": 12750 }, { "epoch": 0.6371128871128872, "grad_norm": 1.8364949226379395, "learning_rate": 3.703042968550895e-05, "loss": 1.0651, "step": 12755 }, { "epoch": 0.6373626373626373, "grad_norm": 1.6487531661987305, "learning_rate": 3.7004944186757737e-05, "loss": 1.0258, "step": 12760 }, { "epoch": 0.6376123876123876, "grad_norm": 1.7383546829223633, "learning_rate": 3.697945868800652e-05, "loss": 1.1282, "step": 12765 }, { "epoch": 0.6378621378621379, "grad_norm": 2.2675228118896484, "learning_rate": 3.6953973189255314e-05, "loss": 1.0829, "step": 12770 }, { "epoch": 0.6381118881118881, "grad_norm": 2.4249300956726074, "learning_rate": 3.6928487690504106e-05, "loss": 1.0798, "step": 12775 }, { "epoch": 0.6383616383616384, "grad_norm": 2.1403698921203613, "learning_rate": 3.690300219175289e-05, "loss": 1.0704, "step": 12780 }, { "epoch": 0.6386113886113887, "grad_norm": 1.7458616495132446, "learning_rate": 3.6877516693001684e-05, "loss": 1.1026, "step": 12785 }, { "epoch": 0.6388611388611388, "grad_norm": 2.441042900085449, "learning_rate": 3.6852031194250476e-05, "loss": 0.943, "step": 12790 }, { "epoch": 0.6391108891108891, "grad_norm": 2.2159359455108643, "learning_rate": 3.682654569549927e-05, "loss": 0.9638, "step": 12795 }, { "epoch": 0.6393606393606394, "grad_norm": 1.6819944381713867, "learning_rate": 3.680106019674805e-05, "loss": 0.9922, "step": 12800 }, { "epoch": 0.6396103896103896, "grad_norm": 1.8609633445739746, "learning_rate": 3.677557469799684e-05, "loss": 1.0195, "step": 12805 }, { "epoch": 0.6398601398601399, "grad_norm": 1.5901659727096558, "learning_rate": 3.675008919924563e-05, "loss": 0.9816, "step": 12810 }, { "epoch": 0.6401098901098901, "grad_norm": 1.8275107145309448, "learning_rate": 3.672460370049442e-05, "loss": 1.1164, "step": 12815 }, { "epoch": 0.6403596403596403, "grad_norm": 1.8035597801208496, "learning_rate": 3.669911820174321e-05, "loss": 1.1594, "step": 12820 }, { "epoch": 0.6406093906093906, "grad_norm": 3.205132484436035, "learning_rate": 3.6673632702992e-05, "loss": 0.9706, "step": 12825 }, { "epoch": 0.6408591408591409, "grad_norm": 1.8314937353134155, "learning_rate": 3.664814720424079e-05, "loss": 1.0211, "step": 12830 }, { "epoch": 0.6411088911088911, "grad_norm": 2.348012685775757, "learning_rate": 3.662266170548958e-05, "loss": 0.9998, "step": 12835 }, { "epoch": 0.6413586413586414, "grad_norm": 1.706289291381836, "learning_rate": 3.6597176206738364e-05, "loss": 1.007, "step": 12840 }, { "epoch": 0.6416083916083916, "grad_norm": 1.6031274795532227, "learning_rate": 3.6571690707987156e-05, "loss": 1.0281, "step": 12845 }, { "epoch": 0.6418581418581418, "grad_norm": 1.9460443258285522, "learning_rate": 3.654620520923595e-05, "loss": 1.1265, "step": 12850 }, { "epoch": 0.6421078921078921, "grad_norm": 1.8056995868682861, "learning_rate": 3.6520719710484734e-05, "loss": 1.0114, "step": 12855 }, { "epoch": 0.6423576423576424, "grad_norm": 2.3449981212615967, "learning_rate": 3.6495234211733526e-05, "loss": 1.0295, "step": 12860 }, { "epoch": 0.6426073926073926, "grad_norm": 2.0823862552642822, "learning_rate": 3.646974871298232e-05, "loss": 1.0158, "step": 12865 }, { "epoch": 0.6428571428571429, "grad_norm": 1.829966425895691, "learning_rate": 3.6444263214231104e-05, "loss": 1.0703, "step": 12870 }, { "epoch": 0.6431068931068931, "grad_norm": 1.620180368423462, "learning_rate": 3.6418777715479896e-05, "loss": 1.0748, "step": 12875 }, { "epoch": 0.6433566433566433, "grad_norm": 1.5299054384231567, "learning_rate": 3.639329221672868e-05, "loss": 1.0385, "step": 12880 }, { "epoch": 0.6436063936063936, "grad_norm": 1.6521722078323364, "learning_rate": 3.636780671797747e-05, "loss": 1.1093, "step": 12885 }, { "epoch": 0.6438561438561439, "grad_norm": 1.664203405380249, "learning_rate": 3.634232121922626e-05, "loss": 1.0036, "step": 12890 }, { "epoch": 0.6441058941058941, "grad_norm": 1.8002187013626099, "learning_rate": 3.631683572047505e-05, "loss": 0.9615, "step": 12895 }, { "epoch": 0.6443556443556444, "grad_norm": 1.727260708808899, "learning_rate": 3.629135022172384e-05, "loss": 1.0009, "step": 12900 }, { "epoch": 0.6446053946053946, "grad_norm": 1.9835728406906128, "learning_rate": 3.626586472297263e-05, "loss": 0.9877, "step": 12905 }, { "epoch": 0.6448551448551448, "grad_norm": 1.8225946426391602, "learning_rate": 3.624037922422142e-05, "loss": 0.9506, "step": 12910 }, { "epoch": 0.6451048951048951, "grad_norm": 1.7325910329818726, "learning_rate": 3.621489372547021e-05, "loss": 1.0536, "step": 12915 }, { "epoch": 0.6453546453546454, "grad_norm": 1.9990692138671875, "learning_rate": 3.6189408226719e-05, "loss": 1.0138, "step": 12920 }, { "epoch": 0.6456043956043956, "grad_norm": 2.200596332550049, "learning_rate": 3.6163922727967784e-05, "loss": 0.9154, "step": 12925 }, { "epoch": 0.6458541458541458, "grad_norm": 1.9858291149139404, "learning_rate": 3.6138437229216576e-05, "loss": 1.0302, "step": 12930 }, { "epoch": 0.6461038961038961, "grad_norm": 1.679103970527649, "learning_rate": 3.611295173046537e-05, "loss": 0.9381, "step": 12935 }, { "epoch": 0.6463536463536463, "grad_norm": 1.989579439163208, "learning_rate": 3.6087466231714153e-05, "loss": 1.0911, "step": 12940 }, { "epoch": 0.6466033966033966, "grad_norm": 2.188225269317627, "learning_rate": 3.6061980732962946e-05, "loss": 1.0099, "step": 12945 }, { "epoch": 0.6468531468531469, "grad_norm": 1.7135589122772217, "learning_rate": 3.603649523421174e-05, "loss": 0.9765, "step": 12950 }, { "epoch": 0.6471028971028971, "grad_norm": 3.2180862426757812, "learning_rate": 3.601100973546053e-05, "loss": 0.9991, "step": 12955 }, { "epoch": 0.6473526473526473, "grad_norm": 2.1181116104125977, "learning_rate": 3.598552423670931e-05, "loss": 1.0651, "step": 12960 }, { "epoch": 0.6476023976023976, "grad_norm": 2.030477523803711, "learning_rate": 3.59600387379581e-05, "loss": 0.9898, "step": 12965 }, { "epoch": 0.6478521478521478, "grad_norm": 1.6955183744430542, "learning_rate": 3.593455323920689e-05, "loss": 0.9913, "step": 12970 }, { "epoch": 0.6481018981018981, "grad_norm": 1.4614200592041016, "learning_rate": 3.5909067740455685e-05, "loss": 0.9958, "step": 12975 }, { "epoch": 0.6483516483516484, "grad_norm": 2.0886523723602295, "learning_rate": 3.588358224170447e-05, "loss": 0.9557, "step": 12980 }, { "epoch": 0.6486013986013986, "grad_norm": 1.5830283164978027, "learning_rate": 3.585809674295326e-05, "loss": 0.9966, "step": 12985 }, { "epoch": 0.6488511488511488, "grad_norm": 1.9637730121612549, "learning_rate": 3.5832611244202055e-05, "loss": 1.0373, "step": 12990 }, { "epoch": 0.6491008991008991, "grad_norm": 2.382225275039673, "learning_rate": 3.580712574545084e-05, "loss": 0.9586, "step": 12995 }, { "epoch": 0.6493506493506493, "grad_norm": 2.370285987854004, "learning_rate": 3.5781640246699626e-05, "loss": 0.957, "step": 13000 }, { "epoch": 0.6496003996003996, "grad_norm": 1.925628662109375, "learning_rate": 3.575615474794842e-05, "loss": 0.9856, "step": 13005 }, { "epoch": 0.6498501498501499, "grad_norm": 1.713376760482788, "learning_rate": 3.573066924919721e-05, "loss": 0.9488, "step": 13010 }, { "epoch": 0.6500999000999002, "grad_norm": 2.986056089401245, "learning_rate": 3.5705183750445996e-05, "loss": 1.1241, "step": 13015 }, { "epoch": 0.6503496503496503, "grad_norm": 1.8642010688781738, "learning_rate": 3.567969825169479e-05, "loss": 1.0642, "step": 13020 }, { "epoch": 0.6505994005994006, "grad_norm": 1.854083776473999, "learning_rate": 3.565421275294358e-05, "loss": 1.1017, "step": 13025 }, { "epoch": 0.6508491508491508, "grad_norm": 2.154189109802246, "learning_rate": 3.5628727254192365e-05, "loss": 0.9773, "step": 13030 }, { "epoch": 0.6510989010989011, "grad_norm": 1.8881289958953857, "learning_rate": 3.560324175544116e-05, "loss": 0.8758, "step": 13035 }, { "epoch": 0.6513486513486514, "grad_norm": 2.037369728088379, "learning_rate": 3.557775625668994e-05, "loss": 1.0354, "step": 13040 }, { "epoch": 0.6515984015984015, "grad_norm": 1.9851499795913696, "learning_rate": 3.5552270757938735e-05, "loss": 1.0293, "step": 13045 }, { "epoch": 0.6518481518481518, "grad_norm": 1.8943560123443604, "learning_rate": 3.552678525918752e-05, "loss": 1.0434, "step": 13050 }, { "epoch": 0.6520979020979021, "grad_norm": 1.9358736276626587, "learning_rate": 3.550129976043631e-05, "loss": 0.9704, "step": 13055 }, { "epoch": 0.6523476523476524, "grad_norm": 2.6822469234466553, "learning_rate": 3.5475814261685105e-05, "loss": 1.0518, "step": 13060 }, { "epoch": 0.6525974025974026, "grad_norm": 1.7299546003341675, "learning_rate": 3.545032876293389e-05, "loss": 1.0498, "step": 13065 }, { "epoch": 0.6528471528471529, "grad_norm": 1.795641303062439, "learning_rate": 3.542484326418268e-05, "loss": 1.1646, "step": 13070 }, { "epoch": 0.653096903096903, "grad_norm": 2.339137077331543, "learning_rate": 3.5399357765431475e-05, "loss": 1.072, "step": 13075 }, { "epoch": 0.6533466533466533, "grad_norm": 2.724149227142334, "learning_rate": 3.537387226668026e-05, "loss": 1.0024, "step": 13080 }, { "epoch": 0.6535964035964036, "grad_norm": 1.9744131565093994, "learning_rate": 3.5348386767929045e-05, "loss": 1.0511, "step": 13085 }, { "epoch": 0.6538461538461539, "grad_norm": 1.6079829931259155, "learning_rate": 3.532290126917784e-05, "loss": 0.9529, "step": 13090 }, { "epoch": 0.6540959040959041, "grad_norm": 1.7337836027145386, "learning_rate": 3.529741577042663e-05, "loss": 0.9357, "step": 13095 }, { "epoch": 0.6543456543456544, "grad_norm": 2.079403877258301, "learning_rate": 3.5271930271675415e-05, "loss": 1.0149, "step": 13100 }, { "epoch": 0.6545954045954046, "grad_norm": 1.658357858657837, "learning_rate": 3.524644477292421e-05, "loss": 1.031, "step": 13105 }, { "epoch": 0.6548451548451548, "grad_norm": 2.2486095428466797, "learning_rate": 3.5220959274173e-05, "loss": 0.9378, "step": 13110 }, { "epoch": 0.6550949050949051, "grad_norm": 1.71742844581604, "learning_rate": 3.519547377542179e-05, "loss": 0.9086, "step": 13115 }, { "epoch": 0.6553446553446554, "grad_norm": 2.2295031547546387, "learning_rate": 3.516998827667057e-05, "loss": 1.0288, "step": 13120 }, { "epoch": 0.6555944055944056, "grad_norm": 1.8170973062515259, "learning_rate": 3.514450277791936e-05, "loss": 1.011, "step": 13125 }, { "epoch": 0.6558441558441559, "grad_norm": 3.240468978881836, "learning_rate": 3.5119017279168155e-05, "loss": 0.9921, "step": 13130 }, { "epoch": 0.656093906093906, "grad_norm": 1.5531034469604492, "learning_rate": 3.509353178041695e-05, "loss": 1.0737, "step": 13135 }, { "epoch": 0.6563436563436563, "grad_norm": 2.1986045837402344, "learning_rate": 3.506804628166573e-05, "loss": 1.0666, "step": 13140 }, { "epoch": 0.6565934065934066, "grad_norm": 3.5087242126464844, "learning_rate": 3.5042560782914524e-05, "loss": 1.0084, "step": 13145 }, { "epoch": 0.6568431568431569, "grad_norm": 1.603033423423767, "learning_rate": 3.501707528416332e-05, "loss": 0.9567, "step": 13150 }, { "epoch": 0.6570929070929071, "grad_norm": 2.020007610321045, "learning_rate": 3.49915897854121e-05, "loss": 1.0452, "step": 13155 }, { "epoch": 0.6573426573426573, "grad_norm": 1.5878465175628662, "learning_rate": 3.496610428666089e-05, "loss": 1.0834, "step": 13160 }, { "epoch": 0.6575924075924076, "grad_norm": 1.6255543231964111, "learning_rate": 3.494061878790968e-05, "loss": 1.0354, "step": 13165 }, { "epoch": 0.6578421578421578, "grad_norm": 1.3828444480895996, "learning_rate": 3.491513328915847e-05, "loss": 1.0331, "step": 13170 }, { "epoch": 0.6580919080919081, "grad_norm": 2.016559600830078, "learning_rate": 3.488964779040726e-05, "loss": 1.0855, "step": 13175 }, { "epoch": 0.6583416583416584, "grad_norm": 1.8929166793823242, "learning_rate": 3.486416229165605e-05, "loss": 1.0663, "step": 13180 }, { "epoch": 0.6585914085914086, "grad_norm": 1.9147865772247314, "learning_rate": 3.483867679290484e-05, "loss": 1.0507, "step": 13185 }, { "epoch": 0.6588411588411588, "grad_norm": 2.0106256008148193, "learning_rate": 3.481319129415363e-05, "loss": 0.9778, "step": 13190 }, { "epoch": 0.6590909090909091, "grad_norm": 1.7128139734268188, "learning_rate": 3.478770579540242e-05, "loss": 1.0283, "step": 13195 }, { "epoch": 0.6593406593406593, "grad_norm": 1.5802463293075562, "learning_rate": 3.4762220296651205e-05, "loss": 1.0814, "step": 13200 }, { "epoch": 0.6595904095904096, "grad_norm": 1.9792945384979248, "learning_rate": 3.47367347979e-05, "loss": 1.0168, "step": 13205 }, { "epoch": 0.6598401598401599, "grad_norm": 2.9029176235198975, "learning_rate": 3.471124929914878e-05, "loss": 1.0767, "step": 13210 }, { "epoch": 0.6600899100899101, "grad_norm": 1.9213687181472778, "learning_rate": 3.4685763800397574e-05, "loss": 1.1497, "step": 13215 }, { "epoch": 0.6603396603396603, "grad_norm": 2.3208796977996826, "learning_rate": 3.4660278301646367e-05, "loss": 1.0252, "step": 13220 }, { "epoch": 0.6605894105894106, "grad_norm": 1.862512230873108, "learning_rate": 3.463479280289515e-05, "loss": 1.0246, "step": 13225 }, { "epoch": 0.6608391608391608, "grad_norm": 1.8942519426345825, "learning_rate": 3.4609307304143944e-05, "loss": 1.043, "step": 13230 }, { "epoch": 0.6610889110889111, "grad_norm": 1.732455849647522, "learning_rate": 3.4583821805392736e-05, "loss": 1.0287, "step": 13235 }, { "epoch": 0.6613386613386614, "grad_norm": 1.7875592708587646, "learning_rate": 3.455833630664152e-05, "loss": 0.933, "step": 13240 }, { "epoch": 0.6615884115884116, "grad_norm": 3.206570863723755, "learning_rate": 3.453285080789031e-05, "loss": 1.1405, "step": 13245 }, { "epoch": 0.6618381618381618, "grad_norm": 2.622054100036621, "learning_rate": 3.45073653091391e-05, "loss": 1.0112, "step": 13250 }, { "epoch": 0.6620879120879121, "grad_norm": 1.8504068851470947, "learning_rate": 3.448187981038789e-05, "loss": 1.0637, "step": 13255 }, { "epoch": 0.6623376623376623, "grad_norm": 2.018211603164673, "learning_rate": 3.4456394311636684e-05, "loss": 0.9074, "step": 13260 }, { "epoch": 0.6625874125874126, "grad_norm": 2.2940590381622314, "learning_rate": 3.443090881288547e-05, "loss": 1.0218, "step": 13265 }, { "epoch": 0.6628371628371629, "grad_norm": 2.4388530254364014, "learning_rate": 3.440542331413426e-05, "loss": 1.1233, "step": 13270 }, { "epoch": 0.663086913086913, "grad_norm": 2.020291328430176, "learning_rate": 3.4379937815383053e-05, "loss": 0.9514, "step": 13275 }, { "epoch": 0.6633366633366633, "grad_norm": 2.1938488483428955, "learning_rate": 3.435445231663184e-05, "loss": 1.0151, "step": 13280 }, { "epoch": 0.6635864135864136, "grad_norm": 1.4632251262664795, "learning_rate": 3.4328966817880624e-05, "loss": 0.9541, "step": 13285 }, { "epoch": 0.6638361638361638, "grad_norm": 1.714279294013977, "learning_rate": 3.4303481319129416e-05, "loss": 1.0474, "step": 13290 }, { "epoch": 0.6640859140859141, "grad_norm": 2.371450185775757, "learning_rate": 3.427799582037821e-05, "loss": 0.9634, "step": 13295 }, { "epoch": 0.6643356643356644, "grad_norm": 1.9237555265426636, "learning_rate": 3.4252510321626994e-05, "loss": 0.9302, "step": 13300 }, { "epoch": 0.6645854145854145, "grad_norm": 2.0708329677581787, "learning_rate": 3.4227024822875786e-05, "loss": 0.977, "step": 13305 }, { "epoch": 0.6648351648351648, "grad_norm": 1.5975594520568848, "learning_rate": 3.420153932412458e-05, "loss": 1.0096, "step": 13310 }, { "epoch": 0.6650849150849151, "grad_norm": 1.8244414329528809, "learning_rate": 3.4176053825373364e-05, "loss": 1.029, "step": 13315 }, { "epoch": 0.6653346653346653, "grad_norm": 1.6947301626205444, "learning_rate": 3.415056832662215e-05, "loss": 0.9993, "step": 13320 }, { "epoch": 0.6655844155844156, "grad_norm": 1.9795631170272827, "learning_rate": 3.412508282787094e-05, "loss": 0.9984, "step": 13325 }, { "epoch": 0.6658341658341659, "grad_norm": 1.9444886445999146, "learning_rate": 3.4099597329119734e-05, "loss": 0.925, "step": 13330 }, { "epoch": 0.666083916083916, "grad_norm": 1.9046074151992798, "learning_rate": 3.407411183036852e-05, "loss": 0.9736, "step": 13335 }, { "epoch": 0.6663336663336663, "grad_norm": 2.6505818367004395, "learning_rate": 3.404862633161731e-05, "loss": 1.0205, "step": 13340 }, { "epoch": 0.6665834165834166, "grad_norm": 1.758602499961853, "learning_rate": 3.40231408328661e-05, "loss": 1.0972, "step": 13345 }, { "epoch": 0.6668331668331668, "grad_norm": 1.85264253616333, "learning_rate": 3.399765533411489e-05, "loss": 1.1142, "step": 13350 }, { "epoch": 0.6670829170829171, "grad_norm": 2.7023532390594482, "learning_rate": 3.397216983536368e-05, "loss": 0.9815, "step": 13355 }, { "epoch": 0.6673326673326674, "grad_norm": 2.068092107772827, "learning_rate": 3.3946684336612466e-05, "loss": 1.0503, "step": 13360 }, { "epoch": 0.6675824175824175, "grad_norm": 1.7430241107940674, "learning_rate": 3.392119883786126e-05, "loss": 0.9278, "step": 13365 }, { "epoch": 0.6678321678321678, "grad_norm": 1.908068060874939, "learning_rate": 3.3895713339110044e-05, "loss": 0.996, "step": 13370 }, { "epoch": 0.6680819180819181, "grad_norm": 2.383435010910034, "learning_rate": 3.3870227840358836e-05, "loss": 1.0184, "step": 13375 }, { "epoch": 0.6683316683316683, "grad_norm": 2.0431487560272217, "learning_rate": 3.384474234160763e-05, "loss": 1.0667, "step": 13380 }, { "epoch": 0.6685814185814186, "grad_norm": 2.054065465927124, "learning_rate": 3.381925684285642e-05, "loss": 0.9956, "step": 13385 }, { "epoch": 0.6688311688311688, "grad_norm": 2.009727954864502, "learning_rate": 3.3793771344105206e-05, "loss": 1.0249, "step": 13390 }, { "epoch": 0.669080919080919, "grad_norm": 1.5411505699157715, "learning_rate": 3.3768285845354e-05, "loss": 1.063, "step": 13395 }, { "epoch": 0.6693306693306693, "grad_norm": 2.148240327835083, "learning_rate": 3.3742800346602783e-05, "loss": 1.0211, "step": 13400 }, { "epoch": 0.6695804195804196, "grad_norm": 1.963493824005127, "learning_rate": 3.371731484785157e-05, "loss": 0.9708, "step": 13405 }, { "epoch": 0.6698301698301699, "grad_norm": 2.072082042694092, "learning_rate": 3.369182934910036e-05, "loss": 1.0034, "step": 13410 }, { "epoch": 0.6700799200799201, "grad_norm": 2.204357624053955, "learning_rate": 3.366634385034915e-05, "loss": 1.1021, "step": 13415 }, { "epoch": 0.6703296703296703, "grad_norm": 2.0721652507781982, "learning_rate": 3.3640858351597945e-05, "loss": 0.9986, "step": 13420 }, { "epoch": 0.6705794205794205, "grad_norm": 1.485994577407837, "learning_rate": 3.361537285284673e-05, "loss": 1.0542, "step": 13425 }, { "epoch": 0.6708291708291708, "grad_norm": 2.3236494064331055, "learning_rate": 3.358988735409552e-05, "loss": 0.8827, "step": 13430 }, { "epoch": 0.6710789210789211, "grad_norm": 2.102216958999634, "learning_rate": 3.3564401855344315e-05, "loss": 1.1147, "step": 13435 }, { "epoch": 0.6713286713286714, "grad_norm": 1.782116413116455, "learning_rate": 3.35389163565931e-05, "loss": 1.0363, "step": 13440 }, { "epoch": 0.6715784215784216, "grad_norm": 1.7523294687271118, "learning_rate": 3.3513430857841886e-05, "loss": 1.1081, "step": 13445 }, { "epoch": 0.6718281718281718, "grad_norm": 1.8718173503875732, "learning_rate": 3.348794535909068e-05, "loss": 1.0047, "step": 13450 }, { "epoch": 0.672077922077922, "grad_norm": 1.4131758213043213, "learning_rate": 3.346245986033947e-05, "loss": 0.9835, "step": 13455 }, { "epoch": 0.6723276723276723, "grad_norm": 1.8383727073669434, "learning_rate": 3.3436974361588256e-05, "loss": 0.9183, "step": 13460 }, { "epoch": 0.6725774225774226, "grad_norm": 2.191143035888672, "learning_rate": 3.341148886283705e-05, "loss": 1.0862, "step": 13465 }, { "epoch": 0.6728271728271729, "grad_norm": 2.3951005935668945, "learning_rate": 3.338600336408584e-05, "loss": 1.0836, "step": 13470 }, { "epoch": 0.6730769230769231, "grad_norm": 1.502684235572815, "learning_rate": 3.3360517865334626e-05, "loss": 1.0269, "step": 13475 }, { "epoch": 0.6733266733266733, "grad_norm": 2.1650617122650146, "learning_rate": 3.333503236658341e-05, "loss": 1.0755, "step": 13480 }, { "epoch": 0.6735764235764236, "grad_norm": 2.1061480045318604, "learning_rate": 3.33095468678322e-05, "loss": 0.926, "step": 13485 }, { "epoch": 0.6738261738261738, "grad_norm": 2.0809988975524902, "learning_rate": 3.3284061369080995e-05, "loss": 1.0311, "step": 13490 }, { "epoch": 0.6740759240759241, "grad_norm": 1.9295034408569336, "learning_rate": 3.325857587032978e-05, "loss": 1.1247, "step": 13495 }, { "epoch": 0.6743256743256744, "grad_norm": 2.0534167289733887, "learning_rate": 3.323309037157857e-05, "loss": 1.0477, "step": 13500 }, { "epoch": 0.6745754245754245, "grad_norm": 1.9253385066986084, "learning_rate": 3.3207604872827365e-05, "loss": 1.0973, "step": 13505 }, { "epoch": 0.6748251748251748, "grad_norm": 1.728393316268921, "learning_rate": 3.318211937407615e-05, "loss": 0.989, "step": 13510 }, { "epoch": 0.6750749250749251, "grad_norm": 1.8977981805801392, "learning_rate": 3.315663387532494e-05, "loss": 1.0143, "step": 13515 }, { "epoch": 0.6753246753246753, "grad_norm": 1.6893837451934814, "learning_rate": 3.313114837657373e-05, "loss": 0.9371, "step": 13520 }, { "epoch": 0.6755744255744256, "grad_norm": 2.4866793155670166, "learning_rate": 3.310566287782252e-05, "loss": 1.0675, "step": 13525 }, { "epoch": 0.6758241758241759, "grad_norm": 3.3266170024871826, "learning_rate": 3.3080177379071306e-05, "loss": 1.0209, "step": 13530 }, { "epoch": 0.676073926073926, "grad_norm": 2.297790288925171, "learning_rate": 3.30546918803201e-05, "loss": 0.9105, "step": 13535 }, { "epoch": 0.6763236763236763, "grad_norm": 2.1635444164276123, "learning_rate": 3.302920638156889e-05, "loss": 1.0455, "step": 13540 }, { "epoch": 0.6765734265734266, "grad_norm": 2.4346725940704346, "learning_rate": 3.300372088281768e-05, "loss": 0.9774, "step": 13545 }, { "epoch": 0.6768231768231768, "grad_norm": 1.7222219705581665, "learning_rate": 3.297823538406647e-05, "loss": 1.1532, "step": 13550 }, { "epoch": 0.6770729270729271, "grad_norm": 1.8412479162216187, "learning_rate": 3.295274988531526e-05, "loss": 0.9889, "step": 13555 }, { "epoch": 0.6773226773226774, "grad_norm": 1.94744074344635, "learning_rate": 3.2927264386564045e-05, "loss": 1.057, "step": 13560 }, { "epoch": 0.6775724275724275, "grad_norm": 1.9805406332015991, "learning_rate": 3.290177888781284e-05, "loss": 0.9976, "step": 13565 }, { "epoch": 0.6778221778221778, "grad_norm": 2.6975035667419434, "learning_rate": 3.287629338906162e-05, "loss": 1.0065, "step": 13570 }, { "epoch": 0.6780719280719281, "grad_norm": 1.8370251655578613, "learning_rate": 3.2850807890310415e-05, "loss": 1.0143, "step": 13575 }, { "epoch": 0.6783216783216783, "grad_norm": 1.9312330484390259, "learning_rate": 3.282532239155921e-05, "loss": 1.017, "step": 13580 }, { "epoch": 0.6785714285714286, "grad_norm": 1.6671421527862549, "learning_rate": 3.279983689280799e-05, "loss": 0.9941, "step": 13585 }, { "epoch": 0.6788211788211789, "grad_norm": 1.9915934801101685, "learning_rate": 3.2774351394056785e-05, "loss": 1.0233, "step": 13590 }, { "epoch": 0.679070929070929, "grad_norm": 2.0020735263824463, "learning_rate": 3.274886589530558e-05, "loss": 1.0185, "step": 13595 }, { "epoch": 0.6793206793206793, "grad_norm": 1.9467239379882812, "learning_rate": 3.272338039655436e-05, "loss": 0.9756, "step": 13600 }, { "epoch": 0.6795704295704296, "grad_norm": 2.187821388244629, "learning_rate": 3.269789489780315e-05, "loss": 0.9709, "step": 13605 }, { "epoch": 0.6798201798201798, "grad_norm": 1.683494210243225, "learning_rate": 3.267240939905194e-05, "loss": 1.0479, "step": 13610 }, { "epoch": 0.6800699300699301, "grad_norm": 1.881611704826355, "learning_rate": 3.264692390030073e-05, "loss": 0.9988, "step": 13615 }, { "epoch": 0.6803196803196803, "grad_norm": 2.112809419631958, "learning_rate": 3.262143840154952e-05, "loss": 1.0712, "step": 13620 }, { "epoch": 0.6805694305694305, "grad_norm": 2.7120745182037354, "learning_rate": 3.259595290279831e-05, "loss": 0.988, "step": 13625 }, { "epoch": 0.6808191808191808, "grad_norm": 1.7600488662719727, "learning_rate": 3.25704674040471e-05, "loss": 1.0864, "step": 13630 }, { "epoch": 0.6810689310689311, "grad_norm": 1.8273953199386597, "learning_rate": 3.254498190529589e-05, "loss": 1.0603, "step": 13635 }, { "epoch": 0.6813186813186813, "grad_norm": 2.104353189468384, "learning_rate": 3.251949640654467e-05, "loss": 1.1155, "step": 13640 }, { "epoch": 0.6815684315684316, "grad_norm": 1.6671171188354492, "learning_rate": 3.2494010907793465e-05, "loss": 1.0011, "step": 13645 }, { "epoch": 0.6818181818181818, "grad_norm": 2.2257213592529297, "learning_rate": 3.246852540904226e-05, "loss": 1.0363, "step": 13650 }, { "epoch": 0.682067932067932, "grad_norm": 1.9039008617401123, "learning_rate": 3.244303991029104e-05, "loss": 1.0894, "step": 13655 }, { "epoch": 0.6823176823176823, "grad_norm": 2.271014928817749, "learning_rate": 3.2417554411539835e-05, "loss": 1.0048, "step": 13660 }, { "epoch": 0.6825674325674326, "grad_norm": 1.5042061805725098, "learning_rate": 3.239206891278863e-05, "loss": 1.0159, "step": 13665 }, { "epoch": 0.6828171828171828, "grad_norm": 1.743938684463501, "learning_rate": 3.236658341403742e-05, "loss": 1.0149, "step": 13670 }, { "epoch": 0.6830669330669331, "grad_norm": 1.5967261791229248, "learning_rate": 3.2341097915286204e-05, "loss": 0.9977, "step": 13675 }, { "epoch": 0.6833166833166833, "grad_norm": 1.9162873029708862, "learning_rate": 3.231561241653499e-05, "loss": 0.9785, "step": 13680 }, { "epoch": 0.6835664335664335, "grad_norm": 1.538184642791748, "learning_rate": 3.229012691778378e-05, "loss": 0.9378, "step": 13685 }, { "epoch": 0.6838161838161838, "grad_norm": 1.4293063879013062, "learning_rate": 3.2264641419032574e-05, "loss": 1.0297, "step": 13690 }, { "epoch": 0.6840659340659341, "grad_norm": 1.9641973972320557, "learning_rate": 3.223915592028136e-05, "loss": 0.9066, "step": 13695 }, { "epoch": 0.6843156843156843, "grad_norm": 2.927401542663574, "learning_rate": 3.221367042153015e-05, "loss": 0.9642, "step": 13700 }, { "epoch": 0.6845654345654346, "grad_norm": 1.597291111946106, "learning_rate": 3.2188184922778944e-05, "loss": 1.0809, "step": 13705 }, { "epoch": 0.6848151848151848, "grad_norm": 2.410902500152588, "learning_rate": 3.216269942402773e-05, "loss": 1.0223, "step": 13710 }, { "epoch": 0.685064935064935, "grad_norm": 1.6840180158615112, "learning_rate": 3.213721392527652e-05, "loss": 1.1301, "step": 13715 }, { "epoch": 0.6853146853146853, "grad_norm": 1.6859220266342163, "learning_rate": 3.211172842652531e-05, "loss": 0.9973, "step": 13720 }, { "epoch": 0.6855644355644356, "grad_norm": 1.6346023082733154, "learning_rate": 3.20862429277741e-05, "loss": 1.0026, "step": 13725 }, { "epoch": 0.6858141858141859, "grad_norm": 1.6695895195007324, "learning_rate": 3.2060757429022884e-05, "loss": 0.989, "step": 13730 }, { "epoch": 0.686063936063936, "grad_norm": 1.582090973854065, "learning_rate": 3.203527193027168e-05, "loss": 1.0706, "step": 13735 }, { "epoch": 0.6863136863136863, "grad_norm": 1.6633132696151733, "learning_rate": 3.200978643152047e-05, "loss": 1.0287, "step": 13740 }, { "epoch": 0.6865634365634365, "grad_norm": 2.138124942779541, "learning_rate": 3.1984300932769254e-05, "loss": 0.995, "step": 13745 }, { "epoch": 0.6868131868131868, "grad_norm": 2.612031936645508, "learning_rate": 3.1958815434018046e-05, "loss": 0.988, "step": 13750 }, { "epoch": 0.6870629370629371, "grad_norm": 1.9808517694473267, "learning_rate": 3.193332993526684e-05, "loss": 1.006, "step": 13755 }, { "epoch": 0.6873126873126874, "grad_norm": 1.8949146270751953, "learning_rate": 3.1907844436515624e-05, "loss": 1.0915, "step": 13760 }, { "epoch": 0.6875624375624375, "grad_norm": 1.5713690519332886, "learning_rate": 3.188235893776441e-05, "loss": 1.0281, "step": 13765 }, { "epoch": 0.6878121878121878, "grad_norm": 1.8597514629364014, "learning_rate": 3.18568734390132e-05, "loss": 1.1246, "step": 13770 }, { "epoch": 0.688061938061938, "grad_norm": 2.226571559906006, "learning_rate": 3.1831387940261994e-05, "loss": 0.9951, "step": 13775 }, { "epoch": 0.6883116883116883, "grad_norm": 1.5547319650650024, "learning_rate": 3.180590244151078e-05, "loss": 1.0864, "step": 13780 }, { "epoch": 0.6885614385614386, "grad_norm": 1.917811393737793, "learning_rate": 3.178041694275957e-05, "loss": 1.0029, "step": 13785 }, { "epoch": 0.6888111888111889, "grad_norm": 2.0627245903015137, "learning_rate": 3.1754931444008364e-05, "loss": 1.1595, "step": 13790 }, { "epoch": 0.689060939060939, "grad_norm": 1.7212413549423218, "learning_rate": 3.1729445945257156e-05, "loss": 1.1031, "step": 13795 }, { "epoch": 0.6893106893106893, "grad_norm": 1.6637972593307495, "learning_rate": 3.1703960446505934e-05, "loss": 0.956, "step": 13800 }, { "epoch": 0.6895604395604396, "grad_norm": 1.953180193901062, "learning_rate": 3.1678474947754727e-05, "loss": 1.0324, "step": 13805 }, { "epoch": 0.6898101898101898, "grad_norm": 2.302455186843872, "learning_rate": 3.165298944900352e-05, "loss": 0.9963, "step": 13810 }, { "epoch": 0.6900599400599401, "grad_norm": 1.9804182052612305, "learning_rate": 3.1627503950252304e-05, "loss": 1.0328, "step": 13815 }, { "epoch": 0.6903096903096904, "grad_norm": 2.2058234214782715, "learning_rate": 3.1602018451501096e-05, "loss": 0.9301, "step": 13820 }, { "epoch": 0.6905594405594405, "grad_norm": 1.8477270603179932, "learning_rate": 3.157653295274989e-05, "loss": 1.0697, "step": 13825 }, { "epoch": 0.6908091908091908, "grad_norm": 1.6500221490859985, "learning_rate": 3.155104745399868e-05, "loss": 1.032, "step": 13830 }, { "epoch": 0.6910589410589411, "grad_norm": 1.7890392541885376, "learning_rate": 3.1525561955247466e-05, "loss": 1.0435, "step": 13835 }, { "epoch": 0.6913086913086913, "grad_norm": 1.8008594512939453, "learning_rate": 3.150007645649625e-05, "loss": 1.0256, "step": 13840 }, { "epoch": 0.6915584415584416, "grad_norm": 1.789910912513733, "learning_rate": 3.1474590957745044e-05, "loss": 1.029, "step": 13845 }, { "epoch": 0.6918081918081919, "grad_norm": 1.733893632888794, "learning_rate": 3.1449105458993836e-05, "loss": 0.9846, "step": 13850 }, { "epoch": 0.692057942057942, "grad_norm": 1.569549560546875, "learning_rate": 3.142361996024262e-05, "loss": 1.0603, "step": 13855 }, { "epoch": 0.6923076923076923, "grad_norm": 1.6041643619537354, "learning_rate": 3.1398134461491413e-05, "loss": 0.9208, "step": 13860 }, { "epoch": 0.6925574425574426, "grad_norm": 1.7089136838912964, "learning_rate": 3.1372648962740206e-05, "loss": 1.0753, "step": 13865 }, { "epoch": 0.6928071928071928, "grad_norm": 2.1899406909942627, "learning_rate": 3.134716346398899e-05, "loss": 1.0527, "step": 13870 }, { "epoch": 0.6930569430569431, "grad_norm": 1.9127686023712158, "learning_rate": 3.132167796523778e-05, "loss": 1.0045, "step": 13875 }, { "epoch": 0.6933066933066933, "grad_norm": 1.830481767654419, "learning_rate": 3.129619246648657e-05, "loss": 0.9996, "step": 13880 }, { "epoch": 0.6935564435564435, "grad_norm": 1.3739105463027954, "learning_rate": 3.127070696773536e-05, "loss": 1.065, "step": 13885 }, { "epoch": 0.6938061938061938, "grad_norm": 1.6658601760864258, "learning_rate": 3.1245221468984146e-05, "loss": 1.1034, "step": 13890 }, { "epoch": 0.6940559440559441, "grad_norm": 1.810303807258606, "learning_rate": 3.121973597023294e-05, "loss": 0.9439, "step": 13895 }, { "epoch": 0.6943056943056943, "grad_norm": 2.1417460441589355, "learning_rate": 3.119425047148173e-05, "loss": 1.0655, "step": 13900 }, { "epoch": 0.6945554445554446, "grad_norm": 1.7808958292007446, "learning_rate": 3.1168764972730516e-05, "loss": 0.9908, "step": 13905 }, { "epoch": 0.6948051948051948, "grad_norm": 2.047415256500244, "learning_rate": 3.114327947397931e-05, "loss": 1.0104, "step": 13910 }, { "epoch": 0.695054945054945, "grad_norm": 2.1172993183135986, "learning_rate": 3.11177939752281e-05, "loss": 0.9782, "step": 13915 }, { "epoch": 0.6953046953046953, "grad_norm": 1.821087121963501, "learning_rate": 3.1092308476476886e-05, "loss": 1.0089, "step": 13920 }, { "epoch": 0.6955544455544456, "grad_norm": 2.4070394039154053, "learning_rate": 3.106682297772567e-05, "loss": 1.026, "step": 13925 }, { "epoch": 0.6958041958041958, "grad_norm": 2.2822370529174805, "learning_rate": 3.104133747897446e-05, "loss": 1.1174, "step": 13930 }, { "epoch": 0.6960539460539461, "grad_norm": 1.6279808282852173, "learning_rate": 3.1015851980223256e-05, "loss": 1.0496, "step": 13935 }, { "epoch": 0.6963036963036963, "grad_norm": 2.735604763031006, "learning_rate": 3.099036648147204e-05, "loss": 0.9091, "step": 13940 }, { "epoch": 0.6965534465534465, "grad_norm": 1.6771985292434692, "learning_rate": 3.096488098272083e-05, "loss": 1.0694, "step": 13945 }, { "epoch": 0.6968031968031968, "grad_norm": 1.6571799516677856, "learning_rate": 3.0939395483969625e-05, "loss": 0.9828, "step": 13950 }, { "epoch": 0.6970529470529471, "grad_norm": 1.6640931367874146, "learning_rate": 3.091390998521842e-05, "loss": 1.0485, "step": 13955 }, { "epoch": 0.6973026973026973, "grad_norm": 2.436185121536255, "learning_rate": 3.0888424486467196e-05, "loss": 1.1092, "step": 13960 }, { "epoch": 0.6975524475524476, "grad_norm": 1.4792695045471191, "learning_rate": 3.086293898771599e-05, "loss": 0.9886, "step": 13965 }, { "epoch": 0.6978021978021978, "grad_norm": 1.4432755708694458, "learning_rate": 3.083745348896478e-05, "loss": 1.0601, "step": 13970 }, { "epoch": 0.698051948051948, "grad_norm": 1.9500714540481567, "learning_rate": 3.081196799021357e-05, "loss": 0.969, "step": 13975 }, { "epoch": 0.6983016983016983, "grad_norm": 2.33362078666687, "learning_rate": 3.078648249146236e-05, "loss": 1.0683, "step": 13980 }, { "epoch": 0.6985514485514486, "grad_norm": 1.8991857767105103, "learning_rate": 3.076099699271115e-05, "loss": 1.0179, "step": 13985 }, { "epoch": 0.6988011988011988, "grad_norm": 1.9946887493133545, "learning_rate": 3.073551149395994e-05, "loss": 1.0917, "step": 13990 }, { "epoch": 0.699050949050949, "grad_norm": 3.2796630859375, "learning_rate": 3.071002599520873e-05, "loss": 0.9302, "step": 13995 }, { "epoch": 0.6993006993006993, "grad_norm": 2.526442766189575, "learning_rate": 3.068454049645751e-05, "loss": 0.9662, "step": 14000 }, { "epoch": 0.6995504495504495, "grad_norm": 2.105339288711548, "learning_rate": 3.0659054997706305e-05, "loss": 1.0252, "step": 14005 }, { "epoch": 0.6998001998001998, "grad_norm": 2.0418102741241455, "learning_rate": 3.06335694989551e-05, "loss": 1.0258, "step": 14010 }, { "epoch": 0.7000499500499501, "grad_norm": 1.8781647682189941, "learning_rate": 3.060808400020388e-05, "loss": 1.1253, "step": 14015 }, { "epoch": 0.7002997002997003, "grad_norm": 1.5927292108535767, "learning_rate": 3.0582598501452675e-05, "loss": 1.0687, "step": 14020 }, { "epoch": 0.7005494505494505, "grad_norm": 1.8490709066390991, "learning_rate": 3.055711300270147e-05, "loss": 0.9192, "step": 14025 }, { "epoch": 0.7007992007992008, "grad_norm": 1.984054684638977, "learning_rate": 3.053162750395025e-05, "loss": 1.0716, "step": 14030 }, { "epoch": 0.701048951048951, "grad_norm": 2.2553510665893555, "learning_rate": 3.0506142005199045e-05, "loss": 1.1163, "step": 14035 }, { "epoch": 0.7012987012987013, "grad_norm": 1.96270751953125, "learning_rate": 3.048065650644783e-05, "loss": 1.0477, "step": 14040 }, { "epoch": 0.7015484515484516, "grad_norm": 1.6319586038589478, "learning_rate": 3.045517100769662e-05, "loss": 1.0273, "step": 14045 }, { "epoch": 0.7017982017982018, "grad_norm": 1.7872244119644165, "learning_rate": 3.042968550894541e-05, "loss": 1.0624, "step": 14050 }, { "epoch": 0.702047952047952, "grad_norm": 1.5737003087997437, "learning_rate": 3.04042000101942e-05, "loss": 1.0484, "step": 14055 }, { "epoch": 0.7022977022977023, "grad_norm": 1.5246913433074951, "learning_rate": 3.037871451144299e-05, "loss": 0.9786, "step": 14060 }, { "epoch": 0.7025474525474525, "grad_norm": 2.2539896965026855, "learning_rate": 3.035322901269178e-05, "loss": 0.9067, "step": 14065 }, { "epoch": 0.7027972027972028, "grad_norm": 1.7219593524932861, "learning_rate": 3.032774351394057e-05, "loss": 1.0101, "step": 14070 }, { "epoch": 0.7030469530469531, "grad_norm": 2.2681634426116943, "learning_rate": 3.0302258015189362e-05, "loss": 0.9842, "step": 14075 }, { "epoch": 0.7032967032967034, "grad_norm": 1.7136024236679077, "learning_rate": 3.0276772516438144e-05, "loss": 0.9427, "step": 14080 }, { "epoch": 0.7035464535464535, "grad_norm": 2.1621670722961426, "learning_rate": 3.0251287017686936e-05, "loss": 0.9902, "step": 14085 }, { "epoch": 0.7037962037962038, "grad_norm": 2.059178113937378, "learning_rate": 3.0225801518935725e-05, "loss": 1.0362, "step": 14090 }, { "epoch": 0.704045954045954, "grad_norm": 1.6055564880371094, "learning_rate": 3.0200316020184517e-05, "loss": 1.1093, "step": 14095 }, { "epoch": 0.7042957042957043, "grad_norm": 1.673064947128296, "learning_rate": 3.0174830521433306e-05, "loss": 1.0825, "step": 14100 }, { "epoch": 0.7045454545454546, "grad_norm": 1.6002095937728882, "learning_rate": 3.0149345022682095e-05, "loss": 1.0425, "step": 14105 }, { "epoch": 0.7047952047952047, "grad_norm": 1.6704866886138916, "learning_rate": 3.0123859523930887e-05, "loss": 1.0587, "step": 14110 }, { "epoch": 0.705044955044955, "grad_norm": 1.7277913093566895, "learning_rate": 3.0098374025179676e-05, "loss": 0.9948, "step": 14115 }, { "epoch": 0.7052947052947053, "grad_norm": 1.4267702102661133, "learning_rate": 3.007288852642846e-05, "loss": 0.9109, "step": 14120 }, { "epoch": 0.7055444555444556, "grad_norm": 1.9565128087997437, "learning_rate": 3.004740302767725e-05, "loss": 1.0084, "step": 14125 }, { "epoch": 0.7057942057942058, "grad_norm": 1.8813354969024658, "learning_rate": 3.0021917528926042e-05, "loss": 1.1002, "step": 14130 }, { "epoch": 0.7060439560439561, "grad_norm": 1.640809178352356, "learning_rate": 2.999643203017483e-05, "loss": 1.0268, "step": 14135 }, { "epoch": 0.7062937062937062, "grad_norm": 1.7236136198043823, "learning_rate": 2.9970946531423623e-05, "loss": 1.023, "step": 14140 }, { "epoch": 0.7065434565434565, "grad_norm": 2.0790011882781982, "learning_rate": 2.9945461032672412e-05, "loss": 1.039, "step": 14145 }, { "epoch": 0.7067932067932068, "grad_norm": 2.038616180419922, "learning_rate": 2.99199755339212e-05, "loss": 0.9694, "step": 14150 }, { "epoch": 0.707042957042957, "grad_norm": 1.8063563108444214, "learning_rate": 2.9894490035169993e-05, "loss": 1.0006, "step": 14155 }, { "epoch": 0.7072927072927073, "grad_norm": 2.7222862243652344, "learning_rate": 2.9869004536418775e-05, "loss": 0.9692, "step": 14160 }, { "epoch": 0.7075424575424576, "grad_norm": 1.9761228561401367, "learning_rate": 2.9843519037667567e-05, "loss": 0.9557, "step": 14165 }, { "epoch": 0.7077922077922078, "grad_norm": 2.558563232421875, "learning_rate": 2.9818033538916356e-05, "loss": 0.9851, "step": 14170 }, { "epoch": 0.708041958041958, "grad_norm": 1.7914317846298218, "learning_rate": 2.9792548040165148e-05, "loss": 1.0807, "step": 14175 }, { "epoch": 0.7082917082917083, "grad_norm": 2.248324155807495, "learning_rate": 2.9767062541413937e-05, "loss": 1.0636, "step": 14180 }, { "epoch": 0.7085414585414586, "grad_norm": 1.9643535614013672, "learning_rate": 2.9741577042662726e-05, "loss": 0.9398, "step": 14185 }, { "epoch": 0.7087912087912088, "grad_norm": 1.7862738370895386, "learning_rate": 2.9716091543911518e-05, "loss": 1.0583, "step": 14190 }, { "epoch": 0.7090409590409591, "grad_norm": 1.8958396911621094, "learning_rate": 2.9690606045160307e-05, "loss": 1.0102, "step": 14195 }, { "epoch": 0.7092907092907093, "grad_norm": 1.6259173154830933, "learning_rate": 2.9665120546409092e-05, "loss": 0.9671, "step": 14200 }, { "epoch": 0.7095404595404595, "grad_norm": 2.1652941703796387, "learning_rate": 2.963963504765788e-05, "loss": 1.0108, "step": 14205 }, { "epoch": 0.7097902097902098, "grad_norm": 1.6092095375061035, "learning_rate": 2.9614149548906673e-05, "loss": 1.0008, "step": 14210 }, { "epoch": 0.7100399600399601, "grad_norm": 1.9558817148208618, "learning_rate": 2.9588664050155462e-05, "loss": 1.0167, "step": 14215 }, { "epoch": 0.7102897102897103, "grad_norm": 2.092153549194336, "learning_rate": 2.9563178551404254e-05, "loss": 1.0229, "step": 14220 }, { "epoch": 0.7105394605394605, "grad_norm": 1.703304648399353, "learning_rate": 2.9537693052653043e-05, "loss": 1.0042, "step": 14225 }, { "epoch": 0.7107892107892108, "grad_norm": 1.7278478145599365, "learning_rate": 2.951220755390183e-05, "loss": 0.9625, "step": 14230 }, { "epoch": 0.711038961038961, "grad_norm": 1.6807026863098145, "learning_rate": 2.9486722055150624e-05, "loss": 1.123, "step": 14235 }, { "epoch": 0.7112887112887113, "grad_norm": 1.8805739879608154, "learning_rate": 2.9461236556399406e-05, "loss": 0.9283, "step": 14240 }, { "epoch": 0.7115384615384616, "grad_norm": 1.8017210960388184, "learning_rate": 2.9435751057648198e-05, "loss": 1.0441, "step": 14245 }, { "epoch": 0.7117882117882118, "grad_norm": 1.7521212100982666, "learning_rate": 2.9410265558896987e-05, "loss": 1.0511, "step": 14250 }, { "epoch": 0.712037962037962, "grad_norm": 2.263437509536743, "learning_rate": 2.938478006014578e-05, "loss": 1.0418, "step": 14255 }, { "epoch": 0.7122877122877123, "grad_norm": 1.622531533241272, "learning_rate": 2.9359294561394568e-05, "loss": 1.0778, "step": 14260 }, { "epoch": 0.7125374625374625, "grad_norm": 2.6002800464630127, "learning_rate": 2.9333809062643357e-05, "loss": 0.9917, "step": 14265 }, { "epoch": 0.7127872127872128, "grad_norm": 1.587630271911621, "learning_rate": 2.930832356389215e-05, "loss": 0.9786, "step": 14270 }, { "epoch": 0.7130369630369631, "grad_norm": 1.7498599290847778, "learning_rate": 2.9282838065140938e-05, "loss": 1.0402, "step": 14275 }, { "epoch": 0.7132867132867133, "grad_norm": 1.6948944330215454, "learning_rate": 2.9257352566389723e-05, "loss": 1.0747, "step": 14280 }, { "epoch": 0.7135364635364635, "grad_norm": 1.7894195318222046, "learning_rate": 2.9231867067638512e-05, "loss": 0.9682, "step": 14285 }, { "epoch": 0.7137862137862138, "grad_norm": 1.6690903902053833, "learning_rate": 2.9206381568887304e-05, "loss": 0.9423, "step": 14290 }, { "epoch": 0.714035964035964, "grad_norm": 1.957198143005371, "learning_rate": 2.9180896070136093e-05, "loss": 1.0218, "step": 14295 }, { "epoch": 0.7142857142857143, "grad_norm": 1.942261815071106, "learning_rate": 2.9155410571384885e-05, "loss": 1.0664, "step": 14300 }, { "epoch": 0.7145354645354646, "grad_norm": 2.391418933868408, "learning_rate": 2.9129925072633674e-05, "loss": 1.0371, "step": 14305 }, { "epoch": 0.7147852147852148, "grad_norm": 1.7501574754714966, "learning_rate": 2.9104439573882462e-05, "loss": 0.9763, "step": 14310 }, { "epoch": 0.715034965034965, "grad_norm": 1.9380589723587036, "learning_rate": 2.9078954075131255e-05, "loss": 0.9787, "step": 14315 }, { "epoch": 0.7152847152847153, "grad_norm": 1.9361175298690796, "learning_rate": 2.905346857638004e-05, "loss": 1.0747, "step": 14320 }, { "epoch": 0.7155344655344655, "grad_norm": 2.347571849822998, "learning_rate": 2.902798307762883e-05, "loss": 1.0179, "step": 14325 }, { "epoch": 0.7157842157842158, "grad_norm": 2.0040557384490967, "learning_rate": 2.9002497578877618e-05, "loss": 0.9225, "step": 14330 }, { "epoch": 0.7160339660339661, "grad_norm": 2.631122350692749, "learning_rate": 2.897701208012641e-05, "loss": 1.0866, "step": 14335 }, { "epoch": 0.7162837162837162, "grad_norm": 1.8372197151184082, "learning_rate": 2.89515265813752e-05, "loss": 0.9801, "step": 14340 }, { "epoch": 0.7165334665334665, "grad_norm": 2.3379416465759277, "learning_rate": 2.892604108262399e-05, "loss": 1.0875, "step": 14345 }, { "epoch": 0.7167832167832168, "grad_norm": 2.1197643280029297, "learning_rate": 2.890055558387278e-05, "loss": 1.0271, "step": 14350 }, { "epoch": 0.717032967032967, "grad_norm": 1.6826616525650024, "learning_rate": 2.887507008512157e-05, "loss": 1.0538, "step": 14355 }, { "epoch": 0.7172827172827173, "grad_norm": 2.2886338233947754, "learning_rate": 2.8849584586370354e-05, "loss": 1.0591, "step": 14360 }, { "epoch": 0.7175324675324676, "grad_norm": 2.6664276123046875, "learning_rate": 2.8824099087619143e-05, "loss": 1.0018, "step": 14365 }, { "epoch": 0.7177822177822177, "grad_norm": 1.521835207939148, "learning_rate": 2.8798613588867935e-05, "loss": 0.9668, "step": 14370 }, { "epoch": 0.718031968031968, "grad_norm": 1.9546817541122437, "learning_rate": 2.8773128090116724e-05, "loss": 1.0368, "step": 14375 }, { "epoch": 0.7182817182817183, "grad_norm": 1.6777119636535645, "learning_rate": 2.8747642591365516e-05, "loss": 1.1497, "step": 14380 }, { "epoch": 0.7185314685314685, "grad_norm": 1.7432042360305786, "learning_rate": 2.8722157092614305e-05, "loss": 1.003, "step": 14385 }, { "epoch": 0.7187812187812188, "grad_norm": 1.775637149810791, "learning_rate": 2.8696671593863093e-05, "loss": 0.9574, "step": 14390 }, { "epoch": 0.7190309690309691, "grad_norm": 2.503387212753296, "learning_rate": 2.8671186095111886e-05, "loss": 1.0462, "step": 14395 }, { "epoch": 0.7192807192807192, "grad_norm": 1.9606380462646484, "learning_rate": 2.864570059636067e-05, "loss": 1.105, "step": 14400 }, { "epoch": 0.7195304695304695, "grad_norm": 1.6805543899536133, "learning_rate": 2.862021509760946e-05, "loss": 0.9909, "step": 14405 }, { "epoch": 0.7197802197802198, "grad_norm": 1.9461746215820312, "learning_rate": 2.859472959885825e-05, "loss": 0.9982, "step": 14410 }, { "epoch": 0.72002997002997, "grad_norm": 1.843605637550354, "learning_rate": 2.856924410010704e-05, "loss": 1.0409, "step": 14415 }, { "epoch": 0.7202797202797203, "grad_norm": 1.8383269309997559, "learning_rate": 2.854375860135583e-05, "loss": 1.106, "step": 14420 }, { "epoch": 0.7205294705294706, "grad_norm": 2.051231622695923, "learning_rate": 2.851827310260462e-05, "loss": 1.0838, "step": 14425 }, { "epoch": 0.7207792207792207, "grad_norm": 2.128842353820801, "learning_rate": 2.849278760385341e-05, "loss": 0.9713, "step": 14430 }, { "epoch": 0.721028971028971, "grad_norm": 1.9823460578918457, "learning_rate": 2.84673021051022e-05, "loss": 0.9497, "step": 14435 }, { "epoch": 0.7212787212787213, "grad_norm": 1.7837356328964233, "learning_rate": 2.8441816606350985e-05, "loss": 1.0434, "step": 14440 }, { "epoch": 0.7215284715284715, "grad_norm": 1.7151055335998535, "learning_rate": 2.8416331107599773e-05, "loss": 1.0443, "step": 14445 }, { "epoch": 0.7217782217782218, "grad_norm": 1.996177315711975, "learning_rate": 2.8390845608848566e-05, "loss": 1.0546, "step": 14450 }, { "epoch": 0.722027972027972, "grad_norm": 2.802534580230713, "learning_rate": 2.8365360110097354e-05, "loss": 1.0358, "step": 14455 }, { "epoch": 0.7222777222777222, "grad_norm": 1.8309533596038818, "learning_rate": 2.8339874611346147e-05, "loss": 1.111, "step": 14460 }, { "epoch": 0.7225274725274725, "grad_norm": 1.4934507608413696, "learning_rate": 2.8314389112594935e-05, "loss": 0.9597, "step": 14465 }, { "epoch": 0.7227772227772228, "grad_norm": 2.084134578704834, "learning_rate": 2.8288903613843724e-05, "loss": 0.9463, "step": 14470 }, { "epoch": 0.723026973026973, "grad_norm": 2.1943137645721436, "learning_rate": 2.8263418115092516e-05, "loss": 0.9628, "step": 14475 }, { "epoch": 0.7232767232767233, "grad_norm": 1.6893725395202637, "learning_rate": 2.8237932616341302e-05, "loss": 1.0893, "step": 14480 }, { "epoch": 0.7235264735264735, "grad_norm": 1.9656121730804443, "learning_rate": 2.821244711759009e-05, "loss": 1.0191, "step": 14485 }, { "epoch": 0.7237762237762237, "grad_norm": 2.154719352722168, "learning_rate": 2.818696161883888e-05, "loss": 0.9747, "step": 14490 }, { "epoch": 0.724025974025974, "grad_norm": 2.775186538696289, "learning_rate": 2.816147612008767e-05, "loss": 1.0306, "step": 14495 }, { "epoch": 0.7242757242757243, "grad_norm": 2.064358711242676, "learning_rate": 2.813599062133646e-05, "loss": 1.0851, "step": 14500 }, { "epoch": 0.7245254745254746, "grad_norm": 1.8345537185668945, "learning_rate": 2.8110505122585253e-05, "loss": 1.044, "step": 14505 }, { "epoch": 0.7247752247752248, "grad_norm": 1.9303945302963257, "learning_rate": 2.808501962383404e-05, "loss": 1.0368, "step": 14510 }, { "epoch": 0.725024975024975, "grad_norm": 1.387463092803955, "learning_rate": 2.805953412508283e-05, "loss": 1.0245, "step": 14515 }, { "epoch": 0.7252747252747253, "grad_norm": 1.9885103702545166, "learning_rate": 2.8034048626331616e-05, "loss": 0.9775, "step": 14520 }, { "epoch": 0.7255244755244755, "grad_norm": 1.8873448371887207, "learning_rate": 2.8008563127580408e-05, "loss": 1.0358, "step": 14525 }, { "epoch": 0.7257742257742258, "grad_norm": 1.6910665035247803, "learning_rate": 2.7983077628829197e-05, "loss": 1.0331, "step": 14530 }, { "epoch": 0.7260239760239761, "grad_norm": 1.5750142335891724, "learning_rate": 2.7957592130077985e-05, "loss": 1.0179, "step": 14535 }, { "epoch": 0.7262737262737263, "grad_norm": 2.680988073348999, "learning_rate": 2.7932106631326777e-05, "loss": 0.9196, "step": 14540 }, { "epoch": 0.7265234765234765, "grad_norm": 1.9531195163726807, "learning_rate": 2.7906621132575566e-05, "loss": 1.0005, "step": 14545 }, { "epoch": 0.7267732267732268, "grad_norm": 1.9615342617034912, "learning_rate": 2.788113563382436e-05, "loss": 1.0067, "step": 14550 }, { "epoch": 0.727022977022977, "grad_norm": 1.9541370868682861, "learning_rate": 2.7855650135073147e-05, "loss": 0.9485, "step": 14555 }, { "epoch": 0.7272727272727273, "grad_norm": 2.3990871906280518, "learning_rate": 2.7830164636321933e-05, "loss": 1.0328, "step": 14560 }, { "epoch": 0.7275224775224776, "grad_norm": 1.6806203126907349, "learning_rate": 2.780467913757072e-05, "loss": 0.979, "step": 14565 }, { "epoch": 0.7277722277722277, "grad_norm": 1.8215844631195068, "learning_rate": 2.777919363881951e-05, "loss": 1.0585, "step": 14570 }, { "epoch": 0.728021978021978, "grad_norm": 1.4571136236190796, "learning_rate": 2.7753708140068302e-05, "loss": 0.9062, "step": 14575 }, { "epoch": 0.7282717282717283, "grad_norm": 1.812682867050171, "learning_rate": 2.772822264131709e-05, "loss": 1.0221, "step": 14580 }, { "epoch": 0.7285214785214785, "grad_norm": 2.2830162048339844, "learning_rate": 2.7702737142565883e-05, "loss": 0.9526, "step": 14585 }, { "epoch": 0.7287712287712288, "grad_norm": 1.5272228717803955, "learning_rate": 2.7677251643814672e-05, "loss": 1.0489, "step": 14590 }, { "epoch": 0.7290209790209791, "grad_norm": 1.7774690389633179, "learning_rate": 2.765176614506346e-05, "loss": 1.0345, "step": 14595 }, { "epoch": 0.7292707292707292, "grad_norm": 2.2517940998077393, "learning_rate": 2.7626280646312246e-05, "loss": 0.9773, "step": 14600 }, { "epoch": 0.7295204795204795, "grad_norm": 1.9288583993911743, "learning_rate": 2.760079514756104e-05, "loss": 1.0669, "step": 14605 }, { "epoch": 0.7297702297702298, "grad_norm": 1.9562615156173706, "learning_rate": 2.7575309648809827e-05, "loss": 0.9451, "step": 14610 }, { "epoch": 0.73001998001998, "grad_norm": 2.55831241607666, "learning_rate": 2.7549824150058616e-05, "loss": 1.0051, "step": 14615 }, { "epoch": 0.7302697302697303, "grad_norm": 2.2193422317504883, "learning_rate": 2.752433865130741e-05, "loss": 1.0057, "step": 14620 }, { "epoch": 0.7305194805194806, "grad_norm": 1.6701267957687378, "learning_rate": 2.7498853152556197e-05, "loss": 1.0637, "step": 14625 }, { "epoch": 0.7307692307692307, "grad_norm": 2.4529311656951904, "learning_rate": 2.747336765380499e-05, "loss": 1.0142, "step": 14630 }, { "epoch": 0.731018981018981, "grad_norm": 1.8824009895324707, "learning_rate": 2.7447882155053778e-05, "loss": 1.0901, "step": 14635 }, { "epoch": 0.7312687312687313, "grad_norm": 2.9650866985321045, "learning_rate": 2.7422396656302564e-05, "loss": 1.0787, "step": 14640 }, { "epoch": 0.7315184815184815, "grad_norm": 1.5884532928466797, "learning_rate": 2.7396911157551352e-05, "loss": 1.0224, "step": 14645 }, { "epoch": 0.7317682317682318, "grad_norm": 1.7757060527801514, "learning_rate": 2.737142565880014e-05, "loss": 0.9366, "step": 14650 }, { "epoch": 0.7320179820179821, "grad_norm": 1.9145288467407227, "learning_rate": 2.7345940160048933e-05, "loss": 1.0128, "step": 14655 }, { "epoch": 0.7322677322677322, "grad_norm": 1.6658424139022827, "learning_rate": 2.7320454661297722e-05, "loss": 0.9678, "step": 14660 }, { "epoch": 0.7325174825174825, "grad_norm": 2.4371275901794434, "learning_rate": 2.7294969162546514e-05, "loss": 1.1188, "step": 14665 }, { "epoch": 0.7327672327672328, "grad_norm": 2.5258047580718994, "learning_rate": 2.7269483663795303e-05, "loss": 1.1235, "step": 14670 }, { "epoch": 0.733016983016983, "grad_norm": 1.6292444467544556, "learning_rate": 2.7243998165044092e-05, "loss": 1.0185, "step": 14675 }, { "epoch": 0.7332667332667333, "grad_norm": 2.5887744426727295, "learning_rate": 2.7218512666292877e-05, "loss": 1.1, "step": 14680 }, { "epoch": 0.7335164835164835, "grad_norm": 1.838495135307312, "learning_rate": 2.719302716754167e-05, "loss": 1.0718, "step": 14685 }, { "epoch": 0.7337662337662337, "grad_norm": 1.669089436531067, "learning_rate": 2.7167541668790458e-05, "loss": 1.0343, "step": 14690 }, { "epoch": 0.734015984015984, "grad_norm": 1.767463207244873, "learning_rate": 2.7142056170039247e-05, "loss": 0.9937, "step": 14695 }, { "epoch": 0.7342657342657343, "grad_norm": 2.516292095184326, "learning_rate": 2.711657067128804e-05, "loss": 1.0743, "step": 14700 }, { "epoch": 0.7345154845154845, "grad_norm": 1.7639210224151611, "learning_rate": 2.7091085172536828e-05, "loss": 1.0747, "step": 14705 }, { "epoch": 0.7347652347652348, "grad_norm": 2.2561230659484863, "learning_rate": 2.706559967378562e-05, "loss": 1.0317, "step": 14710 }, { "epoch": 0.735014985014985, "grad_norm": 1.9814558029174805, "learning_rate": 2.704011417503441e-05, "loss": 0.9551, "step": 14715 }, { "epoch": 0.7352647352647352, "grad_norm": 1.996840238571167, "learning_rate": 2.7014628676283194e-05, "loss": 1.0018, "step": 14720 }, { "epoch": 0.7355144855144855, "grad_norm": 1.7547755241394043, "learning_rate": 2.6989143177531983e-05, "loss": 1.0326, "step": 14725 }, { "epoch": 0.7357642357642358, "grad_norm": 2.698049783706665, "learning_rate": 2.6963657678780775e-05, "loss": 1.0428, "step": 14730 }, { "epoch": 0.736013986013986, "grad_norm": 2.1930978298187256, "learning_rate": 2.6938172180029564e-05, "loss": 0.9862, "step": 14735 }, { "epoch": 0.7362637362637363, "grad_norm": 1.8983509540557861, "learning_rate": 2.6912686681278353e-05, "loss": 0.9658, "step": 14740 }, { "epoch": 0.7365134865134865, "grad_norm": 1.7513315677642822, "learning_rate": 2.6887201182527145e-05, "loss": 0.8987, "step": 14745 }, { "epoch": 0.7367632367632367, "grad_norm": 3.446661949157715, "learning_rate": 2.6861715683775934e-05, "loss": 1.0224, "step": 14750 }, { "epoch": 0.737012987012987, "grad_norm": 1.8465672731399536, "learning_rate": 2.6836230185024726e-05, "loss": 1.1314, "step": 14755 }, { "epoch": 0.7372627372627373, "grad_norm": 2.0707101821899414, "learning_rate": 2.6810744686273508e-05, "loss": 1.0055, "step": 14760 }, { "epoch": 0.7375124875124875, "grad_norm": 1.7795567512512207, "learning_rate": 2.67852591875223e-05, "loss": 1.0736, "step": 14765 }, { "epoch": 0.7377622377622378, "grad_norm": 1.7715641260147095, "learning_rate": 2.675977368877109e-05, "loss": 1.0171, "step": 14770 }, { "epoch": 0.738011988011988, "grad_norm": 1.974444031715393, "learning_rate": 2.6734288190019878e-05, "loss": 1.06, "step": 14775 }, { "epoch": 0.7382617382617382, "grad_norm": 1.808599591255188, "learning_rate": 2.670880269126867e-05, "loss": 0.891, "step": 14780 }, { "epoch": 0.7385114885114885, "grad_norm": 1.433605670928955, "learning_rate": 2.668331719251746e-05, "loss": 1.0108, "step": 14785 }, { "epoch": 0.7387612387612388, "grad_norm": 1.593525767326355, "learning_rate": 2.665783169376625e-05, "loss": 0.973, "step": 14790 }, { "epoch": 0.739010989010989, "grad_norm": 1.5444881916046143, "learning_rate": 2.663234619501504e-05, "loss": 1.1085, "step": 14795 }, { "epoch": 0.7392607392607392, "grad_norm": 1.8968780040740967, "learning_rate": 2.6606860696263825e-05, "loss": 1.0216, "step": 14800 }, { "epoch": 0.7395104895104895, "grad_norm": 2.3464462757110596, "learning_rate": 2.6581375197512614e-05, "loss": 0.977, "step": 14805 }, { "epoch": 0.7397602397602397, "grad_norm": 2.2066173553466797, "learning_rate": 2.6555889698761406e-05, "loss": 1.0464, "step": 14810 }, { "epoch": 0.74000999000999, "grad_norm": 2.8609020709991455, "learning_rate": 2.6530404200010195e-05, "loss": 0.9878, "step": 14815 }, { "epoch": 0.7402597402597403, "grad_norm": 1.7715396881103516, "learning_rate": 2.6504918701258984e-05, "loss": 1.0738, "step": 14820 }, { "epoch": 0.7405094905094906, "grad_norm": 1.6767669916152954, "learning_rate": 2.6479433202507776e-05, "loss": 1.0004, "step": 14825 }, { "epoch": 0.7407592407592407, "grad_norm": 1.7766345739364624, "learning_rate": 2.6453947703756565e-05, "loss": 0.9902, "step": 14830 }, { "epoch": 0.741008991008991, "grad_norm": 1.603087306022644, "learning_rate": 2.6428462205005357e-05, "loss": 1.1591, "step": 14835 }, { "epoch": 0.7412587412587412, "grad_norm": 3.3846123218536377, "learning_rate": 2.640297670625414e-05, "loss": 1.0255, "step": 14840 }, { "epoch": 0.7415084915084915, "grad_norm": 1.8698452711105347, "learning_rate": 2.637749120750293e-05, "loss": 0.9733, "step": 14845 }, { "epoch": 0.7417582417582418, "grad_norm": 2.445197820663452, "learning_rate": 2.635200570875172e-05, "loss": 0.9989, "step": 14850 }, { "epoch": 0.7420079920079921, "grad_norm": 2.424600839614868, "learning_rate": 2.632652021000051e-05, "loss": 1.0294, "step": 14855 }, { "epoch": 0.7422577422577422, "grad_norm": 1.5679882764816284, "learning_rate": 2.63010347112493e-05, "loss": 0.9789, "step": 14860 }, { "epoch": 0.7425074925074925, "grad_norm": 1.8506940603256226, "learning_rate": 2.627554921249809e-05, "loss": 1.0118, "step": 14865 }, { "epoch": 0.7427572427572428, "grad_norm": 2.2714459896087646, "learning_rate": 2.6250063713746882e-05, "loss": 1.0796, "step": 14870 }, { "epoch": 0.743006993006993, "grad_norm": 2.404691219329834, "learning_rate": 2.622457821499567e-05, "loss": 0.9667, "step": 14875 }, { "epoch": 0.7432567432567433, "grad_norm": 1.92850661277771, "learning_rate": 2.6199092716244456e-05, "loss": 1.0163, "step": 14880 }, { "epoch": 0.7435064935064936, "grad_norm": 2.0576303005218506, "learning_rate": 2.6173607217493245e-05, "loss": 0.9455, "step": 14885 }, { "epoch": 0.7437562437562437, "grad_norm": 1.733375072479248, "learning_rate": 2.6148121718742037e-05, "loss": 0.9456, "step": 14890 }, { "epoch": 0.744005994005994, "grad_norm": 2.1487326622009277, "learning_rate": 2.6122636219990826e-05, "loss": 0.9049, "step": 14895 }, { "epoch": 0.7442557442557443, "grad_norm": 1.9882726669311523, "learning_rate": 2.6097150721239615e-05, "loss": 0.8506, "step": 14900 }, { "epoch": 0.7445054945054945, "grad_norm": 2.653013229370117, "learning_rate": 2.6071665222488407e-05, "loss": 0.9709, "step": 14905 }, { "epoch": 0.7447552447552448, "grad_norm": 1.644567608833313, "learning_rate": 2.6046179723737196e-05, "loss": 1.0731, "step": 14910 }, { "epoch": 0.745004995004995, "grad_norm": 1.6906607151031494, "learning_rate": 2.6020694224985988e-05, "loss": 1.0248, "step": 14915 }, { "epoch": 0.7452547452547452, "grad_norm": 1.620770812034607, "learning_rate": 2.599520872623477e-05, "loss": 0.9823, "step": 14920 }, { "epoch": 0.7455044955044955, "grad_norm": 1.7297226190567017, "learning_rate": 2.5969723227483562e-05, "loss": 1.0761, "step": 14925 }, { "epoch": 0.7457542457542458, "grad_norm": 1.4803812503814697, "learning_rate": 2.594423772873235e-05, "loss": 1.0595, "step": 14930 }, { "epoch": 0.746003996003996, "grad_norm": 2.0148537158966064, "learning_rate": 2.5918752229981143e-05, "loss": 0.9188, "step": 14935 }, { "epoch": 0.7462537462537463, "grad_norm": 1.9448295831680298, "learning_rate": 2.5893266731229932e-05, "loss": 0.9605, "step": 14940 }, { "epoch": 0.7465034965034965, "grad_norm": 2.398226261138916, "learning_rate": 2.586778123247872e-05, "loss": 1.0022, "step": 14945 }, { "epoch": 0.7467532467532467, "grad_norm": 1.761758804321289, "learning_rate": 2.5842295733727513e-05, "loss": 1.0508, "step": 14950 }, { "epoch": 0.747002997002997, "grad_norm": 1.5980178117752075, "learning_rate": 2.58168102349763e-05, "loss": 1.0401, "step": 14955 }, { "epoch": 0.7472527472527473, "grad_norm": 1.8886481523513794, "learning_rate": 2.5791324736225087e-05, "loss": 1.0057, "step": 14960 }, { "epoch": 0.7475024975024975, "grad_norm": 1.6918638944625854, "learning_rate": 2.5765839237473876e-05, "loss": 0.9532, "step": 14965 }, { "epoch": 0.7477522477522478, "grad_norm": 1.6452327966690063, "learning_rate": 2.5740353738722668e-05, "loss": 1.0583, "step": 14970 }, { "epoch": 0.748001998001998, "grad_norm": 6.248218059539795, "learning_rate": 2.5714868239971457e-05, "loss": 1.0363, "step": 14975 }, { "epoch": 0.7482517482517482, "grad_norm": 2.4693377017974854, "learning_rate": 2.5689382741220246e-05, "loss": 1.0451, "step": 14980 }, { "epoch": 0.7485014985014985, "grad_norm": 1.718875765800476, "learning_rate": 2.5663897242469038e-05, "loss": 1.0284, "step": 14985 }, { "epoch": 0.7487512487512488, "grad_norm": 1.845837116241455, "learning_rate": 2.5638411743717827e-05, "loss": 1.0286, "step": 14990 }, { "epoch": 0.749000999000999, "grad_norm": 2.112431287765503, "learning_rate": 2.561292624496662e-05, "loss": 0.9979, "step": 14995 }, { "epoch": 0.7492507492507493, "grad_norm": 1.8080332279205322, "learning_rate": 2.55874407462154e-05, "loss": 0.9955, "step": 15000 }, { "epoch": 0.7495004995004995, "grad_norm": 1.9026358127593994, "learning_rate": 2.5561955247464193e-05, "loss": 1.0361, "step": 15005 }, { "epoch": 0.7497502497502497, "grad_norm": 2.113414764404297, "learning_rate": 2.553646974871298e-05, "loss": 1.0253, "step": 15010 }, { "epoch": 0.75, "grad_norm": 1.797728419303894, "learning_rate": 2.5510984249961774e-05, "loss": 0.8827, "step": 15015 }, { "epoch": 0.7502497502497503, "grad_norm": 1.7767447233200073, "learning_rate": 2.5485498751210563e-05, "loss": 1.0478, "step": 15020 }, { "epoch": 0.7504995004995005, "grad_norm": 1.8877149820327759, "learning_rate": 2.546001325245935e-05, "loss": 1.0304, "step": 15025 }, { "epoch": 0.7507492507492507, "grad_norm": 1.9377466440200806, "learning_rate": 2.5434527753708144e-05, "loss": 1.0043, "step": 15030 }, { "epoch": 0.750999000999001, "grad_norm": 2.4546878337860107, "learning_rate": 2.5409042254956932e-05, "loss": 1.1059, "step": 15035 }, { "epoch": 0.7512487512487512, "grad_norm": 2.3053324222564697, "learning_rate": 2.5383556756205718e-05, "loss": 0.9501, "step": 15040 }, { "epoch": 0.7514985014985015, "grad_norm": 1.9901001453399658, "learning_rate": 2.5358071257454507e-05, "loss": 1.0041, "step": 15045 }, { "epoch": 0.7517482517482518, "grad_norm": 1.5859912633895874, "learning_rate": 2.53325857587033e-05, "loss": 1.0625, "step": 15050 }, { "epoch": 0.751998001998002, "grad_norm": 2.0559613704681396, "learning_rate": 2.5307100259952088e-05, "loss": 0.9502, "step": 15055 }, { "epoch": 0.7522477522477522, "grad_norm": 1.8973246812820435, "learning_rate": 2.5281614761200876e-05, "loss": 1.0038, "step": 15060 }, { "epoch": 0.7524975024975025, "grad_norm": 1.802026391029358, "learning_rate": 2.525612926244967e-05, "loss": 1.0733, "step": 15065 }, { "epoch": 0.7527472527472527, "grad_norm": 1.9954891204833984, "learning_rate": 2.5230643763698457e-05, "loss": 0.913, "step": 15070 }, { "epoch": 0.752997002997003, "grad_norm": 2.296133518218994, "learning_rate": 2.520515826494725e-05, "loss": 0.9943, "step": 15075 }, { "epoch": 0.7532467532467533, "grad_norm": 1.816089153289795, "learning_rate": 2.517967276619603e-05, "loss": 0.9656, "step": 15080 }, { "epoch": 0.7534965034965035, "grad_norm": 1.898628830909729, "learning_rate": 2.5154187267444824e-05, "loss": 0.9644, "step": 15085 }, { "epoch": 0.7537462537462537, "grad_norm": 2.004977226257324, "learning_rate": 2.5128701768693613e-05, "loss": 0.9315, "step": 15090 }, { "epoch": 0.753996003996004, "grad_norm": 2.29707407951355, "learning_rate": 2.5103216269942405e-05, "loss": 0.9143, "step": 15095 }, { "epoch": 0.7542457542457542, "grad_norm": 1.7582751512527466, "learning_rate": 2.5077730771191194e-05, "loss": 1.0398, "step": 15100 }, { "epoch": 0.7544955044955045, "grad_norm": 1.892975091934204, "learning_rate": 2.5052245272439982e-05, "loss": 1.0368, "step": 15105 }, { "epoch": 0.7547452547452548, "grad_norm": 1.704008936882019, "learning_rate": 2.5026759773688774e-05, "loss": 1.081, "step": 15110 }, { "epoch": 0.754995004995005, "grad_norm": 2.0476465225219727, "learning_rate": 2.5001274274937563e-05, "loss": 1.0325, "step": 15115 }, { "epoch": 0.7552447552447552, "grad_norm": 2.072239637374878, "learning_rate": 2.4975788776186352e-05, "loss": 1.0548, "step": 15120 }, { "epoch": 0.7554945054945055, "grad_norm": 1.5687493085861206, "learning_rate": 2.495030327743514e-05, "loss": 1.0676, "step": 15125 }, { "epoch": 0.7557442557442557, "grad_norm": 2.0525381565093994, "learning_rate": 2.492481777868393e-05, "loss": 1.0019, "step": 15130 }, { "epoch": 0.755994005994006, "grad_norm": 2.0736782550811768, "learning_rate": 2.489933227993272e-05, "loss": 1.046, "step": 15135 }, { "epoch": 0.7562437562437563, "grad_norm": 1.7794342041015625, "learning_rate": 2.487384678118151e-05, "loss": 1.0477, "step": 15140 }, { "epoch": 0.7564935064935064, "grad_norm": 1.543826699256897, "learning_rate": 2.48483612824303e-05, "loss": 1.0568, "step": 15145 }, { "epoch": 0.7567432567432567, "grad_norm": 2.4219722747802734, "learning_rate": 2.4822875783679085e-05, "loss": 0.9393, "step": 15150 }, { "epoch": 0.756993006993007, "grad_norm": 2.0359227657318115, "learning_rate": 2.4797390284927877e-05, "loss": 0.9215, "step": 15155 }, { "epoch": 0.7572427572427572, "grad_norm": 1.7204830646514893, "learning_rate": 2.4771904786176666e-05, "loss": 1.0684, "step": 15160 }, { "epoch": 0.7574925074925075, "grad_norm": 2.678483247756958, "learning_rate": 2.4746419287425458e-05, "loss": 0.9481, "step": 15165 }, { "epoch": 0.7577422577422578, "grad_norm": 2.0561625957489014, "learning_rate": 2.4720933788674243e-05, "loss": 1.1058, "step": 15170 }, { "epoch": 0.7579920079920079, "grad_norm": 2.426623821258545, "learning_rate": 2.4695448289923036e-05, "loss": 1.13, "step": 15175 }, { "epoch": 0.7582417582417582, "grad_norm": 1.6082041263580322, "learning_rate": 2.4669962791171824e-05, "loss": 1.0288, "step": 15180 }, { "epoch": 0.7584915084915085, "grad_norm": 1.3989824056625366, "learning_rate": 2.4644477292420613e-05, "loss": 1.0555, "step": 15185 }, { "epoch": 0.7587412587412588, "grad_norm": 1.7953685522079468, "learning_rate": 2.4618991793669402e-05, "loss": 0.9081, "step": 15190 }, { "epoch": 0.758991008991009, "grad_norm": 2.366793632507324, "learning_rate": 2.459350629491819e-05, "loss": 0.9873, "step": 15195 }, { "epoch": 0.7592407592407593, "grad_norm": 2.2210183143615723, "learning_rate": 2.4568020796166983e-05, "loss": 1.0286, "step": 15200 }, { "epoch": 0.7594905094905094, "grad_norm": 1.6560128927230835, "learning_rate": 2.4542535297415772e-05, "loss": 1.007, "step": 15205 }, { "epoch": 0.7597402597402597, "grad_norm": 1.7811689376831055, "learning_rate": 2.451704979866456e-05, "loss": 0.9741, "step": 15210 }, { "epoch": 0.75999000999001, "grad_norm": 1.7394849061965942, "learning_rate": 2.449156429991335e-05, "loss": 1.0384, "step": 15215 }, { "epoch": 0.7602397602397603, "grad_norm": 1.9815977811813354, "learning_rate": 2.446607880116214e-05, "loss": 1.0186, "step": 15220 }, { "epoch": 0.7604895104895105, "grad_norm": 2.597395896911621, "learning_rate": 2.444059330241093e-05, "loss": 0.9826, "step": 15225 }, { "epoch": 0.7607392607392608, "grad_norm": 2.2310986518859863, "learning_rate": 2.441510780365972e-05, "loss": 1.0167, "step": 15230 }, { "epoch": 0.760989010989011, "grad_norm": 1.9318314790725708, "learning_rate": 2.4389622304908508e-05, "loss": 1.1166, "step": 15235 }, { "epoch": 0.7612387612387612, "grad_norm": 2.3555150032043457, "learning_rate": 2.4364136806157297e-05, "loss": 1.0146, "step": 15240 }, { "epoch": 0.7614885114885115, "grad_norm": 1.6067463159561157, "learning_rate": 2.433865130740609e-05, "loss": 1.0844, "step": 15245 }, { "epoch": 0.7617382617382618, "grad_norm": 1.7338831424713135, "learning_rate": 2.4313165808654874e-05, "loss": 1.0729, "step": 15250 }, { "epoch": 0.761988011988012, "grad_norm": 2.1199355125427246, "learning_rate": 2.4287680309903666e-05, "loss": 0.9865, "step": 15255 }, { "epoch": 0.7622377622377622, "grad_norm": 2.0799319744110107, "learning_rate": 2.4262194811152455e-05, "loss": 0.9642, "step": 15260 }, { "epoch": 0.7624875124875125, "grad_norm": 3.255760431289673, "learning_rate": 2.4236709312401244e-05, "loss": 1.0158, "step": 15265 }, { "epoch": 0.7627372627372627, "grad_norm": 1.7878175973892212, "learning_rate": 2.4211223813650033e-05, "loss": 1.0091, "step": 15270 }, { "epoch": 0.762987012987013, "grad_norm": 1.9151753187179565, "learning_rate": 2.418573831489882e-05, "loss": 1.0658, "step": 15275 }, { "epoch": 0.7632367632367633, "grad_norm": 2.2587521076202393, "learning_rate": 2.4160252816147614e-05, "loss": 0.9677, "step": 15280 }, { "epoch": 0.7634865134865135, "grad_norm": 2.1629271507263184, "learning_rate": 2.4134767317396403e-05, "loss": 1.0577, "step": 15285 }, { "epoch": 0.7637362637362637, "grad_norm": 1.9463040828704834, "learning_rate": 2.410928181864519e-05, "loss": 0.9118, "step": 15290 }, { "epoch": 0.763986013986014, "grad_norm": 2.0208358764648438, "learning_rate": 2.408379631989398e-05, "loss": 0.9256, "step": 15295 }, { "epoch": 0.7642357642357642, "grad_norm": 2.08180832862854, "learning_rate": 2.4058310821142772e-05, "loss": 0.9746, "step": 15300 }, { "epoch": 0.7644855144855145, "grad_norm": 2.05511474609375, "learning_rate": 2.403282532239156e-05, "loss": 1.0326, "step": 15305 }, { "epoch": 0.7647352647352648, "grad_norm": 2.2029857635498047, "learning_rate": 2.400733982364035e-05, "loss": 1.1053, "step": 15310 }, { "epoch": 0.764985014985015, "grad_norm": 2.373384714126587, "learning_rate": 2.398185432488914e-05, "loss": 1.0849, "step": 15315 }, { "epoch": 0.7652347652347652, "grad_norm": 1.4183902740478516, "learning_rate": 2.3956368826137928e-05, "loss": 1.0506, "step": 15320 }, { "epoch": 0.7654845154845155, "grad_norm": 1.7268801927566528, "learning_rate": 2.393088332738672e-05, "loss": 1.0672, "step": 15325 }, { "epoch": 0.7657342657342657, "grad_norm": 1.764801025390625, "learning_rate": 2.3905397828635505e-05, "loss": 1.1716, "step": 15330 }, { "epoch": 0.765984015984016, "grad_norm": 2.458540201187134, "learning_rate": 2.3879912329884297e-05, "loss": 1.0068, "step": 15335 }, { "epoch": 0.7662337662337663, "grad_norm": 1.9659291505813599, "learning_rate": 2.3854426831133086e-05, "loss": 1.0437, "step": 15340 }, { "epoch": 0.7664835164835165, "grad_norm": 1.640433430671692, "learning_rate": 2.3828941332381878e-05, "loss": 0.9288, "step": 15345 }, { "epoch": 0.7667332667332667, "grad_norm": 3.4648897647857666, "learning_rate": 2.3803455833630664e-05, "loss": 0.9916, "step": 15350 }, { "epoch": 0.766983016983017, "grad_norm": 1.6848660707473755, "learning_rate": 2.3777970334879452e-05, "loss": 0.932, "step": 15355 }, { "epoch": 0.7672327672327672, "grad_norm": 1.8433504104614258, "learning_rate": 2.3752484836128245e-05, "loss": 1.0301, "step": 15360 }, { "epoch": 0.7674825174825175, "grad_norm": 2.5859153270721436, "learning_rate": 2.3726999337377033e-05, "loss": 1.1105, "step": 15365 }, { "epoch": 0.7677322677322678, "grad_norm": 1.6039913892745972, "learning_rate": 2.3701513838625822e-05, "loss": 1.0467, "step": 15370 }, { "epoch": 0.7679820179820179, "grad_norm": 2.428699016571045, "learning_rate": 2.367602833987461e-05, "loss": 0.9836, "step": 15375 }, { "epoch": 0.7682317682317682, "grad_norm": 1.7424918413162231, "learning_rate": 2.3650542841123403e-05, "loss": 0.9971, "step": 15380 }, { "epoch": 0.7684815184815185, "grad_norm": 2.636143445968628, "learning_rate": 2.3625057342372192e-05, "loss": 0.9494, "step": 15385 }, { "epoch": 0.7687312687312687, "grad_norm": 2.1716830730438232, "learning_rate": 2.359957184362098e-05, "loss": 0.9425, "step": 15390 }, { "epoch": 0.768981018981019, "grad_norm": 2.3186094760894775, "learning_rate": 2.357408634486977e-05, "loss": 1.0544, "step": 15395 }, { "epoch": 0.7692307692307693, "grad_norm": 1.8173426389694214, "learning_rate": 2.354860084611856e-05, "loss": 1.0287, "step": 15400 }, { "epoch": 0.7694805194805194, "grad_norm": 1.618956446647644, "learning_rate": 2.352311534736735e-05, "loss": 1.0365, "step": 15405 }, { "epoch": 0.7697302697302697, "grad_norm": 2.1829471588134766, "learning_rate": 2.3497629848616136e-05, "loss": 1.063, "step": 15410 }, { "epoch": 0.76998001998002, "grad_norm": 2.0276670455932617, "learning_rate": 2.3472144349864928e-05, "loss": 1.1562, "step": 15415 }, { "epoch": 0.7702297702297702, "grad_norm": 2.1757924556732178, "learning_rate": 2.3446658851113717e-05, "loss": 0.8825, "step": 15420 }, { "epoch": 0.7704795204795205, "grad_norm": 1.6080944538116455, "learning_rate": 2.342117335236251e-05, "loss": 0.9582, "step": 15425 }, { "epoch": 0.7707292707292708, "grad_norm": 1.880242109298706, "learning_rate": 2.3395687853611295e-05, "loss": 1.0768, "step": 15430 }, { "epoch": 0.7709790209790209, "grad_norm": 2.3536715507507324, "learning_rate": 2.3370202354860087e-05, "loss": 1.0068, "step": 15435 }, { "epoch": 0.7712287712287712, "grad_norm": 1.5416983366012573, "learning_rate": 2.3344716856108876e-05, "loss": 1.0413, "step": 15440 }, { "epoch": 0.7714785214785215, "grad_norm": 1.6107654571533203, "learning_rate": 2.3319231357357664e-05, "loss": 1.0709, "step": 15445 }, { "epoch": 0.7717282717282717, "grad_norm": 2.1901206970214844, "learning_rate": 2.3293745858606453e-05, "loss": 0.9991, "step": 15450 }, { "epoch": 0.771978021978022, "grad_norm": 1.4559460878372192, "learning_rate": 2.3268260359855242e-05, "loss": 0.9921, "step": 15455 }, { "epoch": 0.7722277722277723, "grad_norm": 1.633248209953308, "learning_rate": 2.3242774861104034e-05, "loss": 1.0436, "step": 15460 }, { "epoch": 0.7724775224775224, "grad_norm": 1.7177588939666748, "learning_rate": 2.3217289362352823e-05, "loss": 0.9908, "step": 15465 }, { "epoch": 0.7727272727272727, "grad_norm": 1.6743370294570923, "learning_rate": 2.319180386360161e-05, "loss": 0.9552, "step": 15470 }, { "epoch": 0.772977022977023, "grad_norm": 1.3630130290985107, "learning_rate": 2.31663183648504e-05, "loss": 0.9541, "step": 15475 }, { "epoch": 0.7732267732267732, "grad_norm": 1.9816889762878418, "learning_rate": 2.314083286609919e-05, "loss": 1.0314, "step": 15480 }, { "epoch": 0.7734765234765235, "grad_norm": 1.5740892887115479, "learning_rate": 2.311534736734798e-05, "loss": 1.0145, "step": 15485 }, { "epoch": 0.7737262737262737, "grad_norm": 1.9496132135391235, "learning_rate": 2.3089861868596767e-05, "loss": 1.0722, "step": 15490 }, { "epoch": 0.7739760239760239, "grad_norm": 1.861145257949829, "learning_rate": 2.306437636984556e-05, "loss": 1.1014, "step": 15495 }, { "epoch": 0.7742257742257742, "grad_norm": 1.5514187812805176, "learning_rate": 2.3038890871094348e-05, "loss": 0.9852, "step": 15500 }, { "epoch": 0.7744755244755245, "grad_norm": 1.5909210443496704, "learning_rate": 2.301340537234314e-05, "loss": 1.0416, "step": 15505 }, { "epoch": 0.7747252747252747, "grad_norm": 2.22900652885437, "learning_rate": 2.2987919873591925e-05, "loss": 0.9811, "step": 15510 }, { "epoch": 0.774975024975025, "grad_norm": 2.2890470027923584, "learning_rate": 2.2962434374840718e-05, "loss": 0.9804, "step": 15515 }, { "epoch": 0.7752247752247752, "grad_norm": 2.7346317768096924, "learning_rate": 2.2936948876089506e-05, "loss": 1.0433, "step": 15520 }, { "epoch": 0.7754745254745254, "grad_norm": 1.807704210281372, "learning_rate": 2.2911463377338295e-05, "loss": 1.057, "step": 15525 }, { "epoch": 0.7757242757242757, "grad_norm": 2.7745885848999023, "learning_rate": 2.2885977878587084e-05, "loss": 1.0461, "step": 15530 }, { "epoch": 0.775974025974026, "grad_norm": 1.667669415473938, "learning_rate": 2.2860492379835873e-05, "loss": 1.0019, "step": 15535 }, { "epoch": 0.7762237762237763, "grad_norm": 1.7263712882995605, "learning_rate": 2.2835006881084665e-05, "loss": 1.0836, "step": 15540 }, { "epoch": 0.7764735264735265, "grad_norm": 1.855435848236084, "learning_rate": 2.2809521382333454e-05, "loss": 1.0623, "step": 15545 }, { "epoch": 0.7767232767232767, "grad_norm": 1.8728913068771362, "learning_rate": 2.2784035883582243e-05, "loss": 0.9931, "step": 15550 }, { "epoch": 0.776973026973027, "grad_norm": 1.9231921434402466, "learning_rate": 2.275855038483103e-05, "loss": 1.057, "step": 15555 }, { "epoch": 0.7772227772227772, "grad_norm": 1.90082848072052, "learning_rate": 2.273306488607982e-05, "loss": 0.9695, "step": 15560 }, { "epoch": 0.7774725274725275, "grad_norm": 3.0355005264282227, "learning_rate": 2.2707579387328612e-05, "loss": 1.0403, "step": 15565 }, { "epoch": 0.7777222777222778, "grad_norm": 1.7970954179763794, "learning_rate": 2.2682093888577398e-05, "loss": 0.9926, "step": 15570 }, { "epoch": 0.777972027972028, "grad_norm": 2.1206209659576416, "learning_rate": 2.265660838982619e-05, "loss": 1.0683, "step": 15575 }, { "epoch": 0.7782217782217782, "grad_norm": 1.7014373540878296, "learning_rate": 2.263112289107498e-05, "loss": 1.0146, "step": 15580 }, { "epoch": 0.7784715284715285, "grad_norm": 1.710180640220642, "learning_rate": 2.260563739232377e-05, "loss": 0.95, "step": 15585 }, { "epoch": 0.7787212787212787, "grad_norm": 1.7480230331420898, "learning_rate": 2.2580151893572556e-05, "loss": 0.9718, "step": 15590 }, { "epoch": 0.778971028971029, "grad_norm": 1.8487499952316284, "learning_rate": 2.255466639482135e-05, "loss": 1.0341, "step": 15595 }, { "epoch": 0.7792207792207793, "grad_norm": 2.2541635036468506, "learning_rate": 2.2529180896070137e-05, "loss": 0.932, "step": 15600 }, { "epoch": 0.7794705294705294, "grad_norm": 1.461875319480896, "learning_rate": 2.2503695397318926e-05, "loss": 1.0688, "step": 15605 }, { "epoch": 0.7797202797202797, "grad_norm": 1.8443405628204346, "learning_rate": 2.2478209898567715e-05, "loss": 0.8591, "step": 15610 }, { "epoch": 0.77997002997003, "grad_norm": 2.1003522872924805, "learning_rate": 2.2452724399816504e-05, "loss": 0.9902, "step": 15615 }, { "epoch": 0.7802197802197802, "grad_norm": 1.5168603658676147, "learning_rate": 2.2427238901065296e-05, "loss": 0.9989, "step": 15620 }, { "epoch": 0.7804695304695305, "grad_norm": 3.0858969688415527, "learning_rate": 2.2401753402314085e-05, "loss": 0.9572, "step": 15625 }, { "epoch": 0.7807192807192808, "grad_norm": 1.5143353939056396, "learning_rate": 2.2376267903562873e-05, "loss": 0.9955, "step": 15630 }, { "epoch": 0.7809690309690309, "grad_norm": 1.8788729906082153, "learning_rate": 2.2350782404811662e-05, "loss": 1.061, "step": 15635 }, { "epoch": 0.7812187812187812, "grad_norm": 1.888753056526184, "learning_rate": 2.2325296906060454e-05, "loss": 1.0046, "step": 15640 }, { "epoch": 0.7814685314685315, "grad_norm": 1.7865486145019531, "learning_rate": 2.2299811407309243e-05, "loss": 0.8709, "step": 15645 }, { "epoch": 0.7817182817182817, "grad_norm": 2.63456392288208, "learning_rate": 2.227432590855803e-05, "loss": 1.0016, "step": 15650 }, { "epoch": 0.781968031968032, "grad_norm": 2.3182995319366455, "learning_rate": 2.224884040980682e-05, "loss": 1.052, "step": 15655 }, { "epoch": 0.7822177822177823, "grad_norm": 2.2426466941833496, "learning_rate": 2.222335491105561e-05, "loss": 1.0466, "step": 15660 }, { "epoch": 0.7824675324675324, "grad_norm": 2.1995458602905273, "learning_rate": 2.2197869412304402e-05, "loss": 1.0554, "step": 15665 }, { "epoch": 0.7827172827172827, "grad_norm": 1.7473052740097046, "learning_rate": 2.2172383913553187e-05, "loss": 0.997, "step": 15670 }, { "epoch": 0.782967032967033, "grad_norm": 2.0480690002441406, "learning_rate": 2.214689841480198e-05, "loss": 0.9212, "step": 15675 }, { "epoch": 0.7832167832167832, "grad_norm": 1.8048025369644165, "learning_rate": 2.2121412916050768e-05, "loss": 1.0251, "step": 15680 }, { "epoch": 0.7834665334665335, "grad_norm": 1.9406696557998657, "learning_rate": 2.2095927417299557e-05, "loss": 1.0465, "step": 15685 }, { "epoch": 0.7837162837162838, "grad_norm": 2.519575357437134, "learning_rate": 2.2070441918548346e-05, "loss": 1.0555, "step": 15690 }, { "epoch": 0.7839660339660339, "grad_norm": 2.858858346939087, "learning_rate": 2.2044956419797134e-05, "loss": 1.0292, "step": 15695 }, { "epoch": 0.7842157842157842, "grad_norm": 1.5845837593078613, "learning_rate": 2.2019470921045927e-05, "loss": 1.0958, "step": 15700 }, { "epoch": 0.7844655344655345, "grad_norm": 1.536982774734497, "learning_rate": 2.1993985422294715e-05, "loss": 1.0288, "step": 15705 }, { "epoch": 0.7847152847152847, "grad_norm": 3.0955801010131836, "learning_rate": 2.1968499923543504e-05, "loss": 1.0282, "step": 15710 }, { "epoch": 0.784965034965035, "grad_norm": 1.390059471130371, "learning_rate": 2.1943014424792293e-05, "loss": 1.0953, "step": 15715 }, { "epoch": 0.7852147852147852, "grad_norm": 1.753387212753296, "learning_rate": 2.1917528926041085e-05, "loss": 1.0013, "step": 15720 }, { "epoch": 0.7854645354645354, "grad_norm": 2.6951074600219727, "learning_rate": 2.1892043427289874e-05, "loss": 1.0762, "step": 15725 }, { "epoch": 0.7857142857142857, "grad_norm": 1.744352102279663, "learning_rate": 2.1866557928538663e-05, "loss": 1.1373, "step": 15730 }, { "epoch": 0.785964035964036, "grad_norm": 1.5988683700561523, "learning_rate": 2.184107242978745e-05, "loss": 1.0265, "step": 15735 }, { "epoch": 0.7862137862137862, "grad_norm": 2.2333109378814697, "learning_rate": 2.181558693103624e-05, "loss": 0.9935, "step": 15740 }, { "epoch": 0.7864635364635365, "grad_norm": 1.6030211448669434, "learning_rate": 2.1790101432285033e-05, "loss": 0.9373, "step": 15745 }, { "epoch": 0.7867132867132867, "grad_norm": 2.7056639194488525, "learning_rate": 2.1764615933533818e-05, "loss": 0.9674, "step": 15750 }, { "epoch": 0.7869630369630369, "grad_norm": 2.191183090209961, "learning_rate": 2.173913043478261e-05, "loss": 0.9606, "step": 15755 }, { "epoch": 0.7872127872127872, "grad_norm": 1.858770489692688, "learning_rate": 2.17136449360314e-05, "loss": 0.9102, "step": 15760 }, { "epoch": 0.7874625374625375, "grad_norm": 2.663455009460449, "learning_rate": 2.1688159437280188e-05, "loss": 0.946, "step": 15765 }, { "epoch": 0.7877122877122877, "grad_norm": 3.0094313621520996, "learning_rate": 2.1662673938528977e-05, "loss": 1.1006, "step": 15770 }, { "epoch": 0.787962037962038, "grad_norm": 2.3532984256744385, "learning_rate": 2.1637188439777765e-05, "loss": 1.0059, "step": 15775 }, { "epoch": 0.7882117882117882, "grad_norm": 1.955659031867981, "learning_rate": 2.1611702941026558e-05, "loss": 0.904, "step": 15780 }, { "epoch": 0.7884615384615384, "grad_norm": 1.877435326576233, "learning_rate": 2.1586217442275346e-05, "loss": 0.8591, "step": 15785 }, { "epoch": 0.7887112887112887, "grad_norm": 1.4866602420806885, "learning_rate": 2.1560731943524135e-05, "loss": 1.0059, "step": 15790 }, { "epoch": 0.788961038961039, "grad_norm": 2.380725383758545, "learning_rate": 2.1535246444772924e-05, "loss": 1.0461, "step": 15795 }, { "epoch": 0.7892107892107892, "grad_norm": 1.7922388315200806, "learning_rate": 2.1509760946021716e-05, "loss": 1.054, "step": 15800 }, { "epoch": 0.7894605394605395, "grad_norm": 1.7782460451126099, "learning_rate": 2.1484275447270505e-05, "loss": 0.9943, "step": 15805 }, { "epoch": 0.7897102897102897, "grad_norm": 1.9085813760757446, "learning_rate": 2.1458789948519294e-05, "loss": 1.0219, "step": 15810 }, { "epoch": 0.7899600399600399, "grad_norm": 1.701112151145935, "learning_rate": 2.1433304449768082e-05, "loss": 1.0696, "step": 15815 }, { "epoch": 0.7902097902097902, "grad_norm": 1.615795612335205, "learning_rate": 2.140781895101687e-05, "loss": 1.0462, "step": 15820 }, { "epoch": 0.7904595404595405, "grad_norm": 1.9642367362976074, "learning_rate": 2.1382333452265663e-05, "loss": 1.092, "step": 15825 }, { "epoch": 0.7907092907092907, "grad_norm": 1.389542818069458, "learning_rate": 2.135684795351445e-05, "loss": 1.0274, "step": 15830 }, { "epoch": 0.7909590409590409, "grad_norm": 2.1921679973602295, "learning_rate": 2.133136245476324e-05, "loss": 0.9733, "step": 15835 }, { "epoch": 0.7912087912087912, "grad_norm": 2.5733556747436523, "learning_rate": 2.130587695601203e-05, "loss": 1.0237, "step": 15840 }, { "epoch": 0.7914585414585414, "grad_norm": 1.9003297090530396, "learning_rate": 2.1280391457260822e-05, "loss": 1.1143, "step": 15845 }, { "epoch": 0.7917082917082917, "grad_norm": 1.9378173351287842, "learning_rate": 2.1254905958509607e-05, "loss": 1.2031, "step": 15850 }, { "epoch": 0.791958041958042, "grad_norm": 1.5734570026397705, "learning_rate": 2.1229420459758396e-05, "loss": 1.0044, "step": 15855 }, { "epoch": 0.7922077922077922, "grad_norm": 1.6940454244613647, "learning_rate": 2.120393496100719e-05, "loss": 0.9928, "step": 15860 }, { "epoch": 0.7924575424575424, "grad_norm": 2.47194242477417, "learning_rate": 2.1178449462255977e-05, "loss": 0.9751, "step": 15865 }, { "epoch": 0.7927072927072927, "grad_norm": 1.9862936735153198, "learning_rate": 2.1152963963504766e-05, "loss": 0.8541, "step": 15870 }, { "epoch": 0.792957042957043, "grad_norm": 1.9593781232833862, "learning_rate": 2.1127478464753555e-05, "loss": 1.0234, "step": 15875 }, { "epoch": 0.7932067932067932, "grad_norm": 1.622632622718811, "learning_rate": 2.1101992966002347e-05, "loss": 1.0172, "step": 15880 }, { "epoch": 0.7934565434565435, "grad_norm": 1.9372820854187012, "learning_rate": 2.1076507467251136e-05, "loss": 1.049, "step": 15885 }, { "epoch": 0.7937062937062938, "grad_norm": 2.056973457336426, "learning_rate": 2.1051021968499925e-05, "loss": 1.0589, "step": 15890 }, { "epoch": 0.7939560439560439, "grad_norm": 1.9507545232772827, "learning_rate": 2.1025536469748713e-05, "loss": 0.9432, "step": 15895 }, { "epoch": 0.7942057942057942, "grad_norm": 1.9181671142578125, "learning_rate": 2.1000050970997502e-05, "loss": 0.9931, "step": 15900 }, { "epoch": 0.7944555444555444, "grad_norm": 1.7086602449417114, "learning_rate": 2.0974565472246294e-05, "loss": 1.0894, "step": 15905 }, { "epoch": 0.7947052947052947, "grad_norm": 1.7741049528121948, "learning_rate": 2.094907997349508e-05, "loss": 0.9637, "step": 15910 }, { "epoch": 0.794955044955045, "grad_norm": 1.631587266921997, "learning_rate": 2.0923594474743872e-05, "loss": 1.0115, "step": 15915 }, { "epoch": 0.7952047952047953, "grad_norm": 1.792569637298584, "learning_rate": 2.089810897599266e-05, "loss": 1.0026, "step": 15920 }, { "epoch": 0.7954545454545454, "grad_norm": 1.7319871187210083, "learning_rate": 2.0872623477241453e-05, "loss": 0.9582, "step": 15925 }, { "epoch": 0.7957042957042957, "grad_norm": 2.397439956665039, "learning_rate": 2.0847137978490238e-05, "loss": 0.9809, "step": 15930 }, { "epoch": 0.795954045954046, "grad_norm": 1.6196058988571167, "learning_rate": 2.082165247973903e-05, "loss": 0.9703, "step": 15935 }, { "epoch": 0.7962037962037962, "grad_norm": 1.6584025621414185, "learning_rate": 2.079616698098782e-05, "loss": 0.9486, "step": 15940 }, { "epoch": 0.7964535464535465, "grad_norm": 2.0760159492492676, "learning_rate": 2.0770681482236608e-05, "loss": 1.0565, "step": 15945 }, { "epoch": 0.7967032967032966, "grad_norm": 2.156693458557129, "learning_rate": 2.0745195983485397e-05, "loss": 0.9632, "step": 15950 }, { "epoch": 0.7969530469530469, "grad_norm": 1.7453107833862305, "learning_rate": 2.0719710484734186e-05, "loss": 1.0209, "step": 15955 }, { "epoch": 0.7972027972027972, "grad_norm": 1.8151823282241821, "learning_rate": 2.0694224985982978e-05, "loss": 1.0733, "step": 15960 }, { "epoch": 0.7974525474525475, "grad_norm": 2.2190792560577393, "learning_rate": 2.0668739487231767e-05, "loss": 1.0098, "step": 15965 }, { "epoch": 0.7977022977022977, "grad_norm": 2.0152242183685303, "learning_rate": 2.0643253988480555e-05, "loss": 0.9287, "step": 15970 }, { "epoch": 0.797952047952048, "grad_norm": 1.6919852495193481, "learning_rate": 2.0617768489729344e-05, "loss": 1.0167, "step": 15975 }, { "epoch": 0.7982017982017982, "grad_norm": 2.3829612731933594, "learning_rate": 2.0592282990978133e-05, "loss": 1.003, "step": 15980 }, { "epoch": 0.7984515484515484, "grad_norm": 1.8976484537124634, "learning_rate": 2.0566797492226925e-05, "loss": 1.0698, "step": 15985 }, { "epoch": 0.7987012987012987, "grad_norm": 1.798567295074463, "learning_rate": 2.054131199347571e-05, "loss": 1.0236, "step": 15990 }, { "epoch": 0.798951048951049, "grad_norm": 1.893679141998291, "learning_rate": 2.0515826494724503e-05, "loss": 0.9986, "step": 15995 }, { "epoch": 0.7992007992007992, "grad_norm": 1.8068119287490845, "learning_rate": 2.049034099597329e-05, "loss": 1.0368, "step": 16000 }, { "epoch": 0.7994505494505495, "grad_norm": 2.041576385498047, "learning_rate": 2.0464855497222084e-05, "loss": 1.0085, "step": 16005 }, { "epoch": 0.7997002997002997, "grad_norm": 2.0429227352142334, "learning_rate": 2.043936999847087e-05, "loss": 1.1562, "step": 16010 }, { "epoch": 0.7999500499500499, "grad_norm": 1.8887410163879395, "learning_rate": 2.041388449971966e-05, "loss": 1.0173, "step": 16015 }, { "epoch": 0.8001998001998002, "grad_norm": 2.8678600788116455, "learning_rate": 2.038839900096845e-05, "loss": 1.0838, "step": 16020 }, { "epoch": 0.8004495504495505, "grad_norm": 1.7786484956741333, "learning_rate": 2.036291350221724e-05, "loss": 0.9792, "step": 16025 }, { "epoch": 0.8006993006993007, "grad_norm": 1.7966163158416748, "learning_rate": 2.0337428003466028e-05, "loss": 1.0106, "step": 16030 }, { "epoch": 0.800949050949051, "grad_norm": 1.6805293560028076, "learning_rate": 2.0311942504714817e-05, "loss": 1.0491, "step": 16035 }, { "epoch": 0.8011988011988012, "grad_norm": 2.1417388916015625, "learning_rate": 2.028645700596361e-05, "loss": 0.9617, "step": 16040 }, { "epoch": 0.8014485514485514, "grad_norm": 1.382874846458435, "learning_rate": 2.0260971507212397e-05, "loss": 0.9958, "step": 16045 }, { "epoch": 0.8016983016983017, "grad_norm": 1.7945033311843872, "learning_rate": 2.0235486008461186e-05, "loss": 1.064, "step": 16050 }, { "epoch": 0.801948051948052, "grad_norm": 1.8991245031356812, "learning_rate": 2.0210000509709975e-05, "loss": 0.9456, "step": 16055 }, { "epoch": 0.8021978021978022, "grad_norm": 1.5894447565078735, "learning_rate": 2.0184515010958764e-05, "loss": 1.0049, "step": 16060 }, { "epoch": 0.8024475524475524, "grad_norm": 2.8216049671173096, "learning_rate": 2.0159029512207556e-05, "loss": 0.9586, "step": 16065 }, { "epoch": 0.8026973026973027, "grad_norm": 2.0833261013031006, "learning_rate": 2.013354401345634e-05, "loss": 1.0822, "step": 16070 }, { "epoch": 0.8029470529470529, "grad_norm": 1.7313663959503174, "learning_rate": 2.0108058514705134e-05, "loss": 1.075, "step": 16075 }, { "epoch": 0.8031968031968032, "grad_norm": 1.5326364040374756, "learning_rate": 2.0082573015953922e-05, "loss": 1.1021, "step": 16080 }, { "epoch": 0.8034465534465535, "grad_norm": 1.6177427768707275, "learning_rate": 2.0057087517202715e-05, "loss": 0.943, "step": 16085 }, { "epoch": 0.8036963036963037, "grad_norm": 1.897523045539856, "learning_rate": 2.00316020184515e-05, "loss": 0.9207, "step": 16090 }, { "epoch": 0.8039460539460539, "grad_norm": 1.7828025817871094, "learning_rate": 2.0006116519700292e-05, "loss": 1.0833, "step": 16095 }, { "epoch": 0.8041958041958042, "grad_norm": 1.8338737487792969, "learning_rate": 1.998063102094908e-05, "loss": 1.0154, "step": 16100 }, { "epoch": 0.8044455544455544, "grad_norm": 2.4374196529388428, "learning_rate": 1.995514552219787e-05, "loss": 1.0064, "step": 16105 }, { "epoch": 0.8046953046953047, "grad_norm": 1.8761818408966064, "learning_rate": 1.992966002344666e-05, "loss": 1.076, "step": 16110 }, { "epoch": 0.804945054945055, "grad_norm": 2.244058132171631, "learning_rate": 1.9904174524695447e-05, "loss": 1.0057, "step": 16115 }, { "epoch": 0.8051948051948052, "grad_norm": 2.1392335891723633, "learning_rate": 1.987868902594424e-05, "loss": 0.9955, "step": 16120 }, { "epoch": 0.8054445554445554, "grad_norm": 2.0646233558654785, "learning_rate": 1.985320352719303e-05, "loss": 0.9964, "step": 16125 }, { "epoch": 0.8056943056943057, "grad_norm": 2.113736867904663, "learning_rate": 1.9827718028441817e-05, "loss": 1.0066, "step": 16130 }, { "epoch": 0.8059440559440559, "grad_norm": 2.3362009525299072, "learning_rate": 1.9802232529690606e-05, "loss": 0.9451, "step": 16135 }, { "epoch": 0.8061938061938062, "grad_norm": 1.4647592306137085, "learning_rate": 1.9776747030939398e-05, "loss": 0.9839, "step": 16140 }, { "epoch": 0.8064435564435565, "grad_norm": 1.6586391925811768, "learning_rate": 1.9751261532188187e-05, "loss": 0.9978, "step": 16145 }, { "epoch": 0.8066933066933067, "grad_norm": 1.7524855136871338, "learning_rate": 1.9725776033436972e-05, "loss": 1.0199, "step": 16150 }, { "epoch": 0.8069430569430569, "grad_norm": 2.587614059448242, "learning_rate": 1.9700290534685764e-05, "loss": 1.0808, "step": 16155 }, { "epoch": 0.8071928071928072, "grad_norm": 1.6828502416610718, "learning_rate": 1.9674805035934553e-05, "loss": 1.0566, "step": 16160 }, { "epoch": 0.8074425574425574, "grad_norm": 2.024415969848633, "learning_rate": 1.9649319537183345e-05, "loss": 0.9197, "step": 16165 }, { "epoch": 0.8076923076923077, "grad_norm": 1.839205026626587, "learning_rate": 1.962383403843213e-05, "loss": 1.0427, "step": 16170 }, { "epoch": 0.807942057942058, "grad_norm": 1.5526607036590576, "learning_rate": 1.9598348539680923e-05, "loss": 1.0162, "step": 16175 }, { "epoch": 0.8081918081918081, "grad_norm": 3.644775152206421, "learning_rate": 1.9572863040929712e-05, "loss": 1.1455, "step": 16180 }, { "epoch": 0.8084415584415584, "grad_norm": 1.6216341257095337, "learning_rate": 1.95473775421785e-05, "loss": 1.067, "step": 16185 }, { "epoch": 0.8086913086913087, "grad_norm": 1.6107637882232666, "learning_rate": 1.952189204342729e-05, "loss": 1.0131, "step": 16190 }, { "epoch": 0.8089410589410589, "grad_norm": 2.2755112648010254, "learning_rate": 1.9496406544676078e-05, "loss": 0.9887, "step": 16195 }, { "epoch": 0.8091908091908092, "grad_norm": 1.3982875347137451, "learning_rate": 1.947092104592487e-05, "loss": 0.9627, "step": 16200 }, { "epoch": 0.8094405594405595, "grad_norm": 1.354275107383728, "learning_rate": 1.944543554717366e-05, "loss": 0.9739, "step": 16205 }, { "epoch": 0.8096903096903096, "grad_norm": 1.8264070749282837, "learning_rate": 1.9419950048422448e-05, "loss": 1.0093, "step": 16210 }, { "epoch": 0.8099400599400599, "grad_norm": 2.2912650108337402, "learning_rate": 1.9394464549671237e-05, "loss": 0.9929, "step": 16215 }, { "epoch": 0.8101898101898102, "grad_norm": 1.5152490139007568, "learning_rate": 1.936897905092003e-05, "loss": 1.0835, "step": 16220 }, { "epoch": 0.8104395604395604, "grad_norm": 2.015958786010742, "learning_rate": 1.9343493552168818e-05, "loss": 0.909, "step": 16225 }, { "epoch": 0.8106893106893107, "grad_norm": 1.9148759841918945, "learning_rate": 1.9318008053417607e-05, "loss": 1.0809, "step": 16230 }, { "epoch": 0.810939060939061, "grad_norm": 1.879286527633667, "learning_rate": 1.9292522554666395e-05, "loss": 1.0597, "step": 16235 }, { "epoch": 0.8111888111888111, "grad_norm": 1.9723687171936035, "learning_rate": 1.9267037055915184e-05, "loss": 0.9522, "step": 16240 }, { "epoch": 0.8114385614385614, "grad_norm": 2.049755573272705, "learning_rate": 1.9241551557163976e-05, "loss": 0.9648, "step": 16245 }, { "epoch": 0.8116883116883117, "grad_norm": 2.6925182342529297, "learning_rate": 1.9216066058412762e-05, "loss": 1.0585, "step": 16250 }, { "epoch": 0.811938061938062, "grad_norm": 1.9860273599624634, "learning_rate": 1.9190580559661554e-05, "loss": 0.9702, "step": 16255 }, { "epoch": 0.8121878121878122, "grad_norm": 2.1059043407440186, "learning_rate": 1.9165095060910343e-05, "loss": 0.9834, "step": 16260 }, { "epoch": 0.8124375624375625, "grad_norm": 1.7308789491653442, "learning_rate": 1.913960956215913e-05, "loss": 1.0399, "step": 16265 }, { "epoch": 0.8126873126873126, "grad_norm": 1.8245267868041992, "learning_rate": 1.911412406340792e-05, "loss": 1.0483, "step": 16270 }, { "epoch": 0.8129370629370629, "grad_norm": 1.8117479085922241, "learning_rate": 1.908863856465671e-05, "loss": 1.0819, "step": 16275 }, { "epoch": 0.8131868131868132, "grad_norm": 2.033541440963745, "learning_rate": 1.90631530659055e-05, "loss": 0.9988, "step": 16280 }, { "epoch": 0.8134365634365635, "grad_norm": 2.2522764205932617, "learning_rate": 1.903766756715429e-05, "loss": 1.0833, "step": 16285 }, { "epoch": 0.8136863136863137, "grad_norm": 1.5695265531539917, "learning_rate": 1.901218206840308e-05, "loss": 0.9353, "step": 16290 }, { "epoch": 0.813936063936064, "grad_norm": 1.484015703201294, "learning_rate": 1.8986696569651868e-05, "loss": 1.0172, "step": 16295 }, { "epoch": 0.8141858141858141, "grad_norm": 1.9969638586044312, "learning_rate": 1.896121107090066e-05, "loss": 1.0408, "step": 16300 }, { "epoch": 0.8144355644355644, "grad_norm": 2.361368179321289, "learning_rate": 1.893572557214945e-05, "loss": 0.9451, "step": 16305 }, { "epoch": 0.8146853146853147, "grad_norm": 3.4274418354034424, "learning_rate": 1.8910240073398237e-05, "loss": 1.0227, "step": 16310 }, { "epoch": 0.814935064935065, "grad_norm": 1.7225866317749023, "learning_rate": 1.8884754574647026e-05, "loss": 1.1472, "step": 16315 }, { "epoch": 0.8151848151848152, "grad_norm": 1.7529537677764893, "learning_rate": 1.8859269075895815e-05, "loss": 0.9999, "step": 16320 }, { "epoch": 0.8154345654345654, "grad_norm": 1.6567771434783936, "learning_rate": 1.8833783577144607e-05, "loss": 1.0586, "step": 16325 }, { "epoch": 0.8156843156843157, "grad_norm": 1.6569197177886963, "learning_rate": 1.8808298078393393e-05, "loss": 1.0276, "step": 16330 }, { "epoch": 0.8159340659340659, "grad_norm": 1.3483885526657104, "learning_rate": 1.8782812579642185e-05, "loss": 0.99, "step": 16335 }, { "epoch": 0.8161838161838162, "grad_norm": 1.765367031097412, "learning_rate": 1.8757327080890974e-05, "loss": 1.0281, "step": 16340 }, { "epoch": 0.8164335664335665, "grad_norm": 1.845678687095642, "learning_rate": 1.8731841582139766e-05, "loss": 0.9811, "step": 16345 }, { "epoch": 0.8166833166833167, "grad_norm": 2.3208084106445312, "learning_rate": 1.870635608338855e-05, "loss": 1.0558, "step": 16350 }, { "epoch": 0.8169330669330669, "grad_norm": 2.0484189987182617, "learning_rate": 1.868087058463734e-05, "loss": 1.0673, "step": 16355 }, { "epoch": 0.8171828171828172, "grad_norm": 1.9874671697616577, "learning_rate": 1.8655385085886132e-05, "loss": 1.0837, "step": 16360 }, { "epoch": 0.8174325674325674, "grad_norm": 1.866633415222168, "learning_rate": 1.862989958713492e-05, "loss": 1.052, "step": 16365 }, { "epoch": 0.8176823176823177, "grad_norm": 1.8379453420639038, "learning_rate": 1.860441408838371e-05, "loss": 1.0149, "step": 16370 }, { "epoch": 0.817932067932068, "grad_norm": 1.6842352151870728, "learning_rate": 1.85789285896325e-05, "loss": 0.9538, "step": 16375 }, { "epoch": 0.8181818181818182, "grad_norm": 1.8362879753112793, "learning_rate": 1.855344309088129e-05, "loss": 0.954, "step": 16380 }, { "epoch": 0.8184315684315684, "grad_norm": 2.326918840408325, "learning_rate": 1.852795759213008e-05, "loss": 1.0448, "step": 16385 }, { "epoch": 0.8186813186813187, "grad_norm": 1.824901819229126, "learning_rate": 1.8502472093378868e-05, "loss": 0.9836, "step": 16390 }, { "epoch": 0.8189310689310689, "grad_norm": 2.13266658782959, "learning_rate": 1.8476986594627657e-05, "loss": 1.0095, "step": 16395 }, { "epoch": 0.8191808191808192, "grad_norm": 2.0381312370300293, "learning_rate": 1.8451501095876446e-05, "loss": 1.0003, "step": 16400 }, { "epoch": 0.8194305694305695, "grad_norm": 2.0931103229522705, "learning_rate": 1.8426015597125238e-05, "loss": 1.0229, "step": 16405 }, { "epoch": 0.8196803196803197, "grad_norm": 2.7269723415374756, "learning_rate": 1.8400530098374023e-05, "loss": 1.0525, "step": 16410 }, { "epoch": 0.8199300699300699, "grad_norm": 2.4387447834014893, "learning_rate": 1.8375044599622816e-05, "loss": 0.9385, "step": 16415 }, { "epoch": 0.8201798201798202, "grad_norm": 1.619065523147583, "learning_rate": 1.8349559100871604e-05, "loss": 0.9663, "step": 16420 }, { "epoch": 0.8204295704295704, "grad_norm": 1.8362390995025635, "learning_rate": 1.8324073602120397e-05, "loss": 1.0405, "step": 16425 }, { "epoch": 0.8206793206793207, "grad_norm": 1.7563846111297607, "learning_rate": 1.8298588103369182e-05, "loss": 1.044, "step": 16430 }, { "epoch": 0.820929070929071, "grad_norm": 2.1355466842651367, "learning_rate": 1.8273102604617974e-05, "loss": 1.0553, "step": 16435 }, { "epoch": 0.8211788211788211, "grad_norm": 1.5947548151016235, "learning_rate": 1.8247617105866763e-05, "loss": 1.0833, "step": 16440 }, { "epoch": 0.8214285714285714, "grad_norm": 1.8985258340835571, "learning_rate": 1.8222131607115552e-05, "loss": 1.0375, "step": 16445 }, { "epoch": 0.8216783216783217, "grad_norm": 2.1348514556884766, "learning_rate": 1.819664610836434e-05, "loss": 1.0268, "step": 16450 }, { "epoch": 0.8219280719280719, "grad_norm": 1.9404008388519287, "learning_rate": 1.817116060961313e-05, "loss": 1.0711, "step": 16455 }, { "epoch": 0.8221778221778222, "grad_norm": 1.750197410583496, "learning_rate": 1.814567511086192e-05, "loss": 1.0396, "step": 16460 }, { "epoch": 0.8224275724275725, "grad_norm": 2.296485185623169, "learning_rate": 1.812018961211071e-05, "loss": 1.041, "step": 16465 }, { "epoch": 0.8226773226773226, "grad_norm": 1.6547578573226929, "learning_rate": 1.80947041133595e-05, "loss": 0.9973, "step": 16470 }, { "epoch": 0.8229270729270729, "grad_norm": 1.9422831535339355, "learning_rate": 1.8069218614608288e-05, "loss": 1.0727, "step": 16475 }, { "epoch": 0.8231768231768232, "grad_norm": 1.471479058265686, "learning_rate": 1.8043733115857077e-05, "loss": 0.9874, "step": 16480 }, { "epoch": 0.8234265734265734, "grad_norm": 2.0190298557281494, "learning_rate": 1.801824761710587e-05, "loss": 1.0588, "step": 16485 }, { "epoch": 0.8236763236763237, "grad_norm": 2.03623628616333, "learning_rate": 1.7992762118354654e-05, "loss": 1.0634, "step": 16490 }, { "epoch": 0.823926073926074, "grad_norm": 1.8134379386901855, "learning_rate": 1.7967276619603447e-05, "loss": 1.1629, "step": 16495 }, { "epoch": 0.8241758241758241, "grad_norm": 1.712688684463501, "learning_rate": 1.7941791120852235e-05, "loss": 1.0469, "step": 16500 }, { "epoch": 0.8244255744255744, "grad_norm": 1.3346401453018188, "learning_rate": 1.7916305622101027e-05, "loss": 1.0046, "step": 16505 }, { "epoch": 0.8246753246753247, "grad_norm": 1.9102833271026611, "learning_rate": 1.7890820123349813e-05, "loss": 1.0655, "step": 16510 }, { "epoch": 0.8249250749250749, "grad_norm": 1.813653588294983, "learning_rate": 1.7865334624598605e-05, "loss": 1.027, "step": 16515 }, { "epoch": 0.8251748251748252, "grad_norm": 1.755455493927002, "learning_rate": 1.7839849125847394e-05, "loss": 1.0628, "step": 16520 }, { "epoch": 0.8254245754245755, "grad_norm": 2.11082124710083, "learning_rate": 1.7814363627096183e-05, "loss": 0.9851, "step": 16525 }, { "epoch": 0.8256743256743256, "grad_norm": 2.9195637702941895, "learning_rate": 1.778887812834497e-05, "loss": 1.1099, "step": 16530 }, { "epoch": 0.8259240759240759, "grad_norm": 1.736777663230896, "learning_rate": 1.776339262959376e-05, "loss": 1.012, "step": 16535 }, { "epoch": 0.8261738261738262, "grad_norm": 1.8526850938796997, "learning_rate": 1.7737907130842552e-05, "loss": 1.0293, "step": 16540 }, { "epoch": 0.8264235764235764, "grad_norm": 1.6853814125061035, "learning_rate": 1.771242163209134e-05, "loss": 1.0369, "step": 16545 }, { "epoch": 0.8266733266733267, "grad_norm": 2.656830072402954, "learning_rate": 1.768693613334013e-05, "loss": 1.0962, "step": 16550 }, { "epoch": 0.8269230769230769, "grad_norm": 1.5451220273971558, "learning_rate": 1.766145063458892e-05, "loss": 1.0805, "step": 16555 }, { "epoch": 0.8271728271728271, "grad_norm": 2.0584936141967773, "learning_rate": 1.7635965135837708e-05, "loss": 0.9635, "step": 16560 }, { "epoch": 0.8274225774225774, "grad_norm": 2.6665163040161133, "learning_rate": 1.76104796370865e-05, "loss": 1.0079, "step": 16565 }, { "epoch": 0.8276723276723277, "grad_norm": 1.7817637920379639, "learning_rate": 1.7584994138335285e-05, "loss": 1.0105, "step": 16570 }, { "epoch": 0.827922077922078, "grad_norm": 2.0806615352630615, "learning_rate": 1.7559508639584077e-05, "loss": 1.0634, "step": 16575 }, { "epoch": 0.8281718281718282, "grad_norm": 1.8902524709701538, "learning_rate": 1.7534023140832866e-05, "loss": 1.0338, "step": 16580 }, { "epoch": 0.8284215784215784, "grad_norm": 2.654445171356201, "learning_rate": 1.750853764208166e-05, "loss": 0.9606, "step": 16585 }, { "epoch": 0.8286713286713286, "grad_norm": 1.7926093339920044, "learning_rate": 1.7483052143330444e-05, "loss": 1.0956, "step": 16590 }, { "epoch": 0.8289210789210789, "grad_norm": 2.247774362564087, "learning_rate": 1.7457566644579236e-05, "loss": 0.8975, "step": 16595 }, { "epoch": 0.8291708291708292, "grad_norm": 3.2486302852630615, "learning_rate": 1.7432081145828025e-05, "loss": 1.066, "step": 16600 }, { "epoch": 0.8294205794205795, "grad_norm": 1.603468894958496, "learning_rate": 1.7406595647076814e-05, "loss": 1.0503, "step": 16605 }, { "epoch": 0.8296703296703297, "grad_norm": 1.9915220737457275, "learning_rate": 1.7381110148325602e-05, "loss": 0.8826, "step": 16610 }, { "epoch": 0.8299200799200799, "grad_norm": 1.7421189546585083, "learning_rate": 1.735562464957439e-05, "loss": 0.9644, "step": 16615 }, { "epoch": 0.8301698301698301, "grad_norm": 2.2296559810638428, "learning_rate": 1.7330139150823183e-05, "loss": 1.0375, "step": 16620 }, { "epoch": 0.8304195804195804, "grad_norm": 1.8005317449569702, "learning_rate": 1.7304653652071972e-05, "loss": 1.0304, "step": 16625 }, { "epoch": 0.8306693306693307, "grad_norm": 2.4719784259796143, "learning_rate": 1.727916815332076e-05, "loss": 1.0372, "step": 16630 }, { "epoch": 0.830919080919081, "grad_norm": 1.8428573608398438, "learning_rate": 1.725368265456955e-05, "loss": 0.9997, "step": 16635 }, { "epoch": 0.8311688311688312, "grad_norm": 1.7237192392349243, "learning_rate": 1.7228197155818342e-05, "loss": 1.005, "step": 16640 }, { "epoch": 0.8314185814185814, "grad_norm": 1.992737054824829, "learning_rate": 1.720271165706713e-05, "loss": 1.0471, "step": 16645 }, { "epoch": 0.8316683316683317, "grad_norm": 2.3027689456939697, "learning_rate": 1.717722615831592e-05, "loss": 0.9996, "step": 16650 }, { "epoch": 0.8319180819180819, "grad_norm": 1.6896758079528809, "learning_rate": 1.7151740659564708e-05, "loss": 1.0799, "step": 16655 }, { "epoch": 0.8321678321678322, "grad_norm": 2.1603260040283203, "learning_rate": 1.7126255160813497e-05, "loss": 1.0595, "step": 16660 }, { "epoch": 0.8324175824175825, "grad_norm": 1.7082576751708984, "learning_rate": 1.710076966206229e-05, "loss": 1.0916, "step": 16665 }, { "epoch": 0.8326673326673326, "grad_norm": 1.8712321519851685, "learning_rate": 1.7075284163311075e-05, "loss": 0.9091, "step": 16670 }, { "epoch": 0.8329170829170829, "grad_norm": 2.7003233432769775, "learning_rate": 1.7049798664559867e-05, "loss": 0.9654, "step": 16675 }, { "epoch": 0.8331668331668332, "grad_norm": 1.9339138269424438, "learning_rate": 1.7024313165808656e-05, "loss": 1.0393, "step": 16680 }, { "epoch": 0.8334165834165834, "grad_norm": 1.7652283906936646, "learning_rate": 1.6998827667057444e-05, "loss": 0.9471, "step": 16685 }, { "epoch": 0.8336663336663337, "grad_norm": 3.767974376678467, "learning_rate": 1.6973342168306233e-05, "loss": 1.0817, "step": 16690 }, { "epoch": 0.833916083916084, "grad_norm": 1.7147862911224365, "learning_rate": 1.6947856669555022e-05, "loss": 0.9357, "step": 16695 }, { "epoch": 0.8341658341658341, "grad_norm": 1.6620596647262573, "learning_rate": 1.6922371170803814e-05, "loss": 1.0119, "step": 16700 }, { "epoch": 0.8344155844155844, "grad_norm": 1.67545747756958, "learning_rate": 1.6896885672052603e-05, "loss": 0.9604, "step": 16705 }, { "epoch": 0.8346653346653347, "grad_norm": 1.6696892976760864, "learning_rate": 1.6871400173301392e-05, "loss": 0.9645, "step": 16710 }, { "epoch": 0.8349150849150849, "grad_norm": 1.4201419353485107, "learning_rate": 1.684591467455018e-05, "loss": 0.9698, "step": 16715 }, { "epoch": 0.8351648351648352, "grad_norm": 1.4580844640731812, "learning_rate": 1.6820429175798973e-05, "loss": 1.0224, "step": 16720 }, { "epoch": 0.8354145854145855, "grad_norm": 1.4426074028015137, "learning_rate": 1.679494367704776e-05, "loss": 0.962, "step": 16725 }, { "epoch": 0.8356643356643356, "grad_norm": 1.6445611715316772, "learning_rate": 1.676945817829655e-05, "loss": 0.912, "step": 16730 }, { "epoch": 0.8359140859140859, "grad_norm": 1.9365720748901367, "learning_rate": 1.674397267954534e-05, "loss": 1.0056, "step": 16735 }, { "epoch": 0.8361638361638362, "grad_norm": 2.157348871231079, "learning_rate": 1.6718487180794128e-05, "loss": 1.056, "step": 16740 }, { "epoch": 0.8364135864135864, "grad_norm": 1.8339561223983765, "learning_rate": 1.669300168204292e-05, "loss": 1.0254, "step": 16745 }, { "epoch": 0.8366633366633367, "grad_norm": 1.836093544960022, "learning_rate": 1.6667516183291705e-05, "loss": 0.9678, "step": 16750 }, { "epoch": 0.836913086913087, "grad_norm": 1.8950530290603638, "learning_rate": 1.6642030684540498e-05, "loss": 1.0384, "step": 16755 }, { "epoch": 0.8371628371628371, "grad_norm": 1.8085708618164062, "learning_rate": 1.6616545185789286e-05, "loss": 1.0004, "step": 16760 }, { "epoch": 0.8374125874125874, "grad_norm": 1.9204152822494507, "learning_rate": 1.6591059687038075e-05, "loss": 1.0322, "step": 16765 }, { "epoch": 0.8376623376623377, "grad_norm": 1.582283854484558, "learning_rate": 1.6565574188286864e-05, "loss": 0.9618, "step": 16770 }, { "epoch": 0.8379120879120879, "grad_norm": 1.751755952835083, "learning_rate": 1.6540088689535653e-05, "loss": 1.0228, "step": 16775 }, { "epoch": 0.8381618381618382, "grad_norm": 1.5449683666229248, "learning_rate": 1.6514603190784445e-05, "loss": 0.9155, "step": 16780 }, { "epoch": 0.8384115884115884, "grad_norm": 1.4771562814712524, "learning_rate": 1.6489117692033234e-05, "loss": 0.8857, "step": 16785 }, { "epoch": 0.8386613386613386, "grad_norm": 4.676975727081299, "learning_rate": 1.6463632193282023e-05, "loss": 0.9586, "step": 16790 }, { "epoch": 0.8389110889110889, "grad_norm": 1.6731184720993042, "learning_rate": 1.643814669453081e-05, "loss": 1.1599, "step": 16795 }, { "epoch": 0.8391608391608392, "grad_norm": 1.569836139678955, "learning_rate": 1.6412661195779604e-05, "loss": 0.9715, "step": 16800 }, { "epoch": 0.8394105894105894, "grad_norm": 2.0505669116973877, "learning_rate": 1.6387175697028392e-05, "loss": 1.1189, "step": 16805 }, { "epoch": 0.8396603396603397, "grad_norm": 2.840055227279663, "learning_rate": 1.636169019827718e-05, "loss": 0.9049, "step": 16810 }, { "epoch": 0.8399100899100899, "grad_norm": 1.9198589324951172, "learning_rate": 1.633620469952597e-05, "loss": 1.0566, "step": 16815 }, { "epoch": 0.8401598401598401, "grad_norm": 2.596996784210205, "learning_rate": 1.631071920077476e-05, "loss": 1.0205, "step": 16820 }, { "epoch": 0.8404095904095904, "grad_norm": 2.606471538543701, "learning_rate": 1.628523370202355e-05, "loss": 1.0684, "step": 16825 }, { "epoch": 0.8406593406593407, "grad_norm": 1.9520634412765503, "learning_rate": 1.6259748203272336e-05, "loss": 1.0679, "step": 16830 }, { "epoch": 0.8409090909090909, "grad_norm": 1.670035481452942, "learning_rate": 1.623426270452113e-05, "loss": 0.9448, "step": 16835 }, { "epoch": 0.8411588411588412, "grad_norm": 1.6297519207000732, "learning_rate": 1.6208777205769917e-05, "loss": 1.0197, "step": 16840 }, { "epoch": 0.8414085914085914, "grad_norm": 1.8443796634674072, "learning_rate": 1.618329170701871e-05, "loss": 1.0554, "step": 16845 }, { "epoch": 0.8416583416583416, "grad_norm": 1.5690349340438843, "learning_rate": 1.6157806208267495e-05, "loss": 1.0659, "step": 16850 }, { "epoch": 0.8419080919080919, "grad_norm": 1.8457492589950562, "learning_rate": 1.6132320709516287e-05, "loss": 0.9802, "step": 16855 }, { "epoch": 0.8421578421578422, "grad_norm": 1.6740140914916992, "learning_rate": 1.6106835210765076e-05, "loss": 0.9963, "step": 16860 }, { "epoch": 0.8424075924075924, "grad_norm": 1.5507893562316895, "learning_rate": 1.6081349712013865e-05, "loss": 0.9968, "step": 16865 }, { "epoch": 0.8426573426573427, "grad_norm": 1.6649622917175293, "learning_rate": 1.6055864213262653e-05, "loss": 1.0325, "step": 16870 }, { "epoch": 0.8429070929070929, "grad_norm": 3.046107053756714, "learning_rate": 1.6030378714511442e-05, "loss": 0.9554, "step": 16875 }, { "epoch": 0.8431568431568431, "grad_norm": 1.940637469291687, "learning_rate": 1.6004893215760234e-05, "loss": 1.0428, "step": 16880 }, { "epoch": 0.8434065934065934, "grad_norm": 1.6635692119598389, "learning_rate": 1.5979407717009023e-05, "loss": 1.0601, "step": 16885 }, { "epoch": 0.8436563436563437, "grad_norm": 2.247532367706299, "learning_rate": 1.5953922218257812e-05, "loss": 1.0679, "step": 16890 }, { "epoch": 0.843906093906094, "grad_norm": 3.5479092597961426, "learning_rate": 1.59284367195066e-05, "loss": 1.0119, "step": 16895 }, { "epoch": 0.8441558441558441, "grad_norm": 2.633639097213745, "learning_rate": 1.590295122075539e-05, "loss": 0.9842, "step": 16900 }, { "epoch": 0.8444055944055944, "grad_norm": 2.0455236434936523, "learning_rate": 1.5877465722004182e-05, "loss": 0.9031, "step": 16905 }, { "epoch": 0.8446553446553446, "grad_norm": 2.2022545337677, "learning_rate": 1.5851980223252967e-05, "loss": 1.1078, "step": 16910 }, { "epoch": 0.8449050949050949, "grad_norm": 1.6922475099563599, "learning_rate": 1.582649472450176e-05, "loss": 1.0708, "step": 16915 }, { "epoch": 0.8451548451548452, "grad_norm": 2.0287370681762695, "learning_rate": 1.5801009225750548e-05, "loss": 1.0665, "step": 16920 }, { "epoch": 0.8454045954045954, "grad_norm": 2.2619705200195312, "learning_rate": 1.577552372699934e-05, "loss": 0.9378, "step": 16925 }, { "epoch": 0.8456543456543456, "grad_norm": 1.6781498193740845, "learning_rate": 1.5750038228248126e-05, "loss": 1.0338, "step": 16930 }, { "epoch": 0.8459040959040959, "grad_norm": 3.1359751224517822, "learning_rate": 1.5724552729496918e-05, "loss": 1.0077, "step": 16935 }, { "epoch": 0.8461538461538461, "grad_norm": 2.0720598697662354, "learning_rate": 1.5699067230745707e-05, "loss": 1.024, "step": 16940 }, { "epoch": 0.8464035964035964, "grad_norm": 2.1837546825408936, "learning_rate": 1.5673581731994496e-05, "loss": 1.0797, "step": 16945 }, { "epoch": 0.8466533466533467, "grad_norm": 1.606764316558838, "learning_rate": 1.5648096233243284e-05, "loss": 1.0346, "step": 16950 }, { "epoch": 0.846903096903097, "grad_norm": 2.662525177001953, "learning_rate": 1.5622610734492073e-05, "loss": 0.987, "step": 16955 }, { "epoch": 0.8471528471528471, "grad_norm": 1.8253047466278076, "learning_rate": 1.5597125235740865e-05, "loss": 1.0834, "step": 16960 }, { "epoch": 0.8474025974025974, "grad_norm": 1.6476891040802002, "learning_rate": 1.5571639736989654e-05, "loss": 1.0514, "step": 16965 }, { "epoch": 0.8476523476523476, "grad_norm": 1.7799973487854004, "learning_rate": 1.5546154238238443e-05, "loss": 1.0809, "step": 16970 }, { "epoch": 0.8479020979020979, "grad_norm": 1.6737701892852783, "learning_rate": 1.552066873948723e-05, "loss": 1.0132, "step": 16975 }, { "epoch": 0.8481518481518482, "grad_norm": 1.7446956634521484, "learning_rate": 1.549518324073602e-05, "loss": 0.9172, "step": 16980 }, { "epoch": 0.8484015984015985, "grad_norm": 2.2451064586639404, "learning_rate": 1.5469697741984813e-05, "loss": 1.0138, "step": 16985 }, { "epoch": 0.8486513486513486, "grad_norm": 2.1713593006134033, "learning_rate": 1.5444212243233598e-05, "loss": 1.115, "step": 16990 }, { "epoch": 0.8489010989010989, "grad_norm": 2.4247050285339355, "learning_rate": 1.541872674448239e-05, "loss": 1.0006, "step": 16995 }, { "epoch": 0.8491508491508492, "grad_norm": 1.7015314102172852, "learning_rate": 1.539324124573118e-05, "loss": 0.9538, "step": 17000 }, { "epoch": 0.8494005994005994, "grad_norm": 1.9837943315505981, "learning_rate": 1.536775574697997e-05, "loss": 1.0368, "step": 17005 }, { "epoch": 0.8496503496503497, "grad_norm": 1.8093103170394897, "learning_rate": 1.5342270248228757e-05, "loss": 1.061, "step": 17010 }, { "epoch": 0.8499000999000998, "grad_norm": 1.904503583908081, "learning_rate": 1.531678474947755e-05, "loss": 0.932, "step": 17015 }, { "epoch": 0.8501498501498501, "grad_norm": 1.7908234596252441, "learning_rate": 1.5291299250726338e-05, "loss": 0.9912, "step": 17020 }, { "epoch": 0.8503996003996004, "grad_norm": 1.5041658878326416, "learning_rate": 1.5265813751975126e-05, "loss": 0.957, "step": 17025 }, { "epoch": 0.8506493506493507, "grad_norm": 1.978719711303711, "learning_rate": 1.5240328253223915e-05, "loss": 0.9626, "step": 17030 }, { "epoch": 0.8508991008991009, "grad_norm": 1.9930400848388672, "learning_rate": 1.5214842754472706e-05, "loss": 0.9515, "step": 17035 }, { "epoch": 0.8511488511488512, "grad_norm": 1.8856816291809082, "learning_rate": 1.5189357255721494e-05, "loss": 0.9805, "step": 17040 }, { "epoch": 0.8513986013986014, "grad_norm": 1.3716100454330444, "learning_rate": 1.5163871756970285e-05, "loss": 1.0447, "step": 17045 }, { "epoch": 0.8516483516483516, "grad_norm": 1.9595167636871338, "learning_rate": 1.5138386258219072e-05, "loss": 0.9485, "step": 17050 }, { "epoch": 0.8518981018981019, "grad_norm": 1.7601492404937744, "learning_rate": 1.5112900759467863e-05, "loss": 0.9375, "step": 17055 }, { "epoch": 0.8521478521478522, "grad_norm": 1.4522981643676758, "learning_rate": 1.5087415260716653e-05, "loss": 0.8979, "step": 17060 }, { "epoch": 0.8523976023976024, "grad_norm": 1.8892203569412231, "learning_rate": 1.5061929761965444e-05, "loss": 1.1184, "step": 17065 }, { "epoch": 0.8526473526473527, "grad_norm": 1.9771981239318848, "learning_rate": 1.503644426321423e-05, "loss": 0.9251, "step": 17070 }, { "epoch": 0.8528971028971029, "grad_norm": 1.902441143989563, "learning_rate": 1.5010958764463021e-05, "loss": 0.9891, "step": 17075 }, { "epoch": 0.8531468531468531, "grad_norm": 2.597015619277954, "learning_rate": 1.4985473265711812e-05, "loss": 1.0097, "step": 17080 }, { "epoch": 0.8533966033966034, "grad_norm": 1.5347827672958374, "learning_rate": 1.49599877669606e-05, "loss": 0.9836, "step": 17085 }, { "epoch": 0.8536463536463537, "grad_norm": 1.718320608139038, "learning_rate": 1.4934502268209387e-05, "loss": 1.0112, "step": 17090 }, { "epoch": 0.8538961038961039, "grad_norm": 2.4177908897399902, "learning_rate": 1.4909016769458178e-05, "loss": 1.0341, "step": 17095 }, { "epoch": 0.8541458541458542, "grad_norm": 2.2081170082092285, "learning_rate": 1.4883531270706968e-05, "loss": 0.9838, "step": 17100 }, { "epoch": 0.8543956043956044, "grad_norm": 1.7391544580459595, "learning_rate": 1.4858045771955759e-05, "loss": 0.8853, "step": 17105 }, { "epoch": 0.8546453546453546, "grad_norm": 1.5422165393829346, "learning_rate": 1.4832560273204546e-05, "loss": 1.0264, "step": 17110 }, { "epoch": 0.8548951048951049, "grad_norm": 2.018001079559326, "learning_rate": 1.4807074774453337e-05, "loss": 0.9896, "step": 17115 }, { "epoch": 0.8551448551448552, "grad_norm": 1.6138808727264404, "learning_rate": 1.4781589275702127e-05, "loss": 1.0742, "step": 17120 }, { "epoch": 0.8553946053946054, "grad_norm": 1.863389015197754, "learning_rate": 1.4756103776950916e-05, "loss": 1.0549, "step": 17125 }, { "epoch": 0.8556443556443556, "grad_norm": 1.852931022644043, "learning_rate": 1.4730618278199703e-05, "loss": 1.065, "step": 17130 }, { "epoch": 0.8558941058941059, "grad_norm": 1.7187727689743042, "learning_rate": 1.4705132779448493e-05, "loss": 1.0154, "step": 17135 }, { "epoch": 0.8561438561438561, "grad_norm": 1.719221591949463, "learning_rate": 1.4679647280697284e-05, "loss": 1.0513, "step": 17140 }, { "epoch": 0.8563936063936064, "grad_norm": 2.2141828536987305, "learning_rate": 1.4654161781946074e-05, "loss": 0.9621, "step": 17145 }, { "epoch": 0.8566433566433567, "grad_norm": 1.8306005001068115, "learning_rate": 1.4628676283194861e-05, "loss": 1.076, "step": 17150 }, { "epoch": 0.8568931068931069, "grad_norm": 2.22892427444458, "learning_rate": 1.4603190784443652e-05, "loss": 1.025, "step": 17155 }, { "epoch": 0.8571428571428571, "grad_norm": 1.8869359493255615, "learning_rate": 1.4577705285692442e-05, "loss": 1.02, "step": 17160 }, { "epoch": 0.8573926073926074, "grad_norm": 1.8768341541290283, "learning_rate": 1.4552219786941231e-05, "loss": 0.9927, "step": 17165 }, { "epoch": 0.8576423576423576, "grad_norm": 1.7526390552520752, "learning_rate": 1.452673428819002e-05, "loss": 0.939, "step": 17170 }, { "epoch": 0.8578921078921079, "grad_norm": 1.7864093780517578, "learning_rate": 1.4501248789438809e-05, "loss": 1.0764, "step": 17175 }, { "epoch": 0.8581418581418582, "grad_norm": 1.8163315057754517, "learning_rate": 1.44757632906876e-05, "loss": 1.0682, "step": 17180 }, { "epoch": 0.8583916083916084, "grad_norm": 1.7926602363586426, "learning_rate": 1.445027779193639e-05, "loss": 1.0599, "step": 17185 }, { "epoch": 0.8586413586413586, "grad_norm": 1.8079785108566284, "learning_rate": 1.4424792293185177e-05, "loss": 0.9185, "step": 17190 }, { "epoch": 0.8588911088911089, "grad_norm": 2.019160747528076, "learning_rate": 1.4399306794433967e-05, "loss": 0.9354, "step": 17195 }, { "epoch": 0.8591408591408591, "grad_norm": 2.0943005084991455, "learning_rate": 1.4373821295682758e-05, "loss": 1.0091, "step": 17200 }, { "epoch": 0.8593906093906094, "grad_norm": 1.9846165180206299, "learning_rate": 1.4348335796931547e-05, "loss": 1.0091, "step": 17205 }, { "epoch": 0.8596403596403597, "grad_norm": 1.7512320280075073, "learning_rate": 1.4322850298180335e-05, "loss": 0.9962, "step": 17210 }, { "epoch": 0.8598901098901099, "grad_norm": 2.218754291534424, "learning_rate": 1.4297364799429124e-05, "loss": 1.015, "step": 17215 }, { "epoch": 0.8601398601398601, "grad_norm": 1.7318496704101562, "learning_rate": 1.4271879300677915e-05, "loss": 0.9703, "step": 17220 }, { "epoch": 0.8603896103896104, "grad_norm": 1.9032024145126343, "learning_rate": 1.4246393801926705e-05, "loss": 0.9705, "step": 17225 }, { "epoch": 0.8606393606393606, "grad_norm": 2.945817232131958, "learning_rate": 1.4220908303175492e-05, "loss": 1.0647, "step": 17230 }, { "epoch": 0.8608891108891109, "grad_norm": 1.8639159202575684, "learning_rate": 1.4195422804424283e-05, "loss": 0.9856, "step": 17235 }, { "epoch": 0.8611388611388612, "grad_norm": 1.8537933826446533, "learning_rate": 1.4169937305673073e-05, "loss": 0.9602, "step": 17240 }, { "epoch": 0.8613886113886113, "grad_norm": 1.5963232517242432, "learning_rate": 1.4144451806921862e-05, "loss": 0.9857, "step": 17245 }, { "epoch": 0.8616383616383616, "grad_norm": 2.090930223464966, "learning_rate": 1.4118966308170651e-05, "loss": 1.027, "step": 17250 }, { "epoch": 0.8618881118881119, "grad_norm": 2.9146783351898193, "learning_rate": 1.409348080941944e-05, "loss": 1.1338, "step": 17255 }, { "epoch": 0.8621378621378621, "grad_norm": 1.5964157581329346, "learning_rate": 1.406799531066823e-05, "loss": 0.9499, "step": 17260 }, { "epoch": 0.8623876123876124, "grad_norm": 2.1536428928375244, "learning_rate": 1.404250981191702e-05, "loss": 1.0823, "step": 17265 }, { "epoch": 0.8626373626373627, "grad_norm": 1.9103713035583496, "learning_rate": 1.4017024313165808e-05, "loss": 1.0298, "step": 17270 }, { "epoch": 0.8628871128871128, "grad_norm": 1.665310263633728, "learning_rate": 1.3991538814414598e-05, "loss": 1.0291, "step": 17275 }, { "epoch": 0.8631368631368631, "grad_norm": 1.759605884552002, "learning_rate": 1.3966053315663389e-05, "loss": 0.982, "step": 17280 }, { "epoch": 0.8633866133866134, "grad_norm": 1.6318140029907227, "learning_rate": 1.394056781691218e-05, "loss": 1.0346, "step": 17285 }, { "epoch": 0.8636363636363636, "grad_norm": 1.9143450260162354, "learning_rate": 1.3915082318160966e-05, "loss": 1.1093, "step": 17290 }, { "epoch": 0.8638861138861139, "grad_norm": 2.166961908340454, "learning_rate": 1.3889596819409755e-05, "loss": 1.0004, "step": 17295 }, { "epoch": 0.8641358641358642, "grad_norm": 2.3383617401123047, "learning_rate": 1.3864111320658546e-05, "loss": 1.0121, "step": 17300 }, { "epoch": 0.8643856143856143, "grad_norm": 2.1761865615844727, "learning_rate": 1.3838625821907336e-05, "loss": 1.0392, "step": 17305 }, { "epoch": 0.8646353646353646, "grad_norm": 1.8706380128860474, "learning_rate": 1.3813140323156123e-05, "loss": 1.0417, "step": 17310 }, { "epoch": 0.8648851148851149, "grad_norm": 2.1875288486480713, "learning_rate": 1.3787654824404914e-05, "loss": 1.1167, "step": 17315 }, { "epoch": 0.8651348651348651, "grad_norm": 1.526767611503601, "learning_rate": 1.3762169325653704e-05, "loss": 0.9468, "step": 17320 }, { "epoch": 0.8653846153846154, "grad_norm": 2.1448304653167725, "learning_rate": 1.3736683826902495e-05, "loss": 0.919, "step": 17325 }, { "epoch": 0.8656343656343657, "grad_norm": 1.7803351879119873, "learning_rate": 1.3711198328151282e-05, "loss": 1.0473, "step": 17330 }, { "epoch": 0.8658841158841158, "grad_norm": 1.7067418098449707, "learning_rate": 1.368571282940007e-05, "loss": 0.9806, "step": 17335 }, { "epoch": 0.8661338661338661, "grad_norm": 2.082343101501465, "learning_rate": 1.3660227330648861e-05, "loss": 1.0324, "step": 17340 }, { "epoch": 0.8663836163836164, "grad_norm": 1.8490986824035645, "learning_rate": 1.3634741831897652e-05, "loss": 0.8992, "step": 17345 }, { "epoch": 0.8666333666333667, "grad_norm": 1.88497793674469, "learning_rate": 1.3609256333146439e-05, "loss": 0.9893, "step": 17350 }, { "epoch": 0.8668831168831169, "grad_norm": 2.81160831451416, "learning_rate": 1.3583770834395229e-05, "loss": 1.0869, "step": 17355 }, { "epoch": 0.8671328671328671, "grad_norm": 1.560278058052063, "learning_rate": 1.355828533564402e-05, "loss": 1.0054, "step": 17360 }, { "epoch": 0.8673826173826173, "grad_norm": 2.569840431213379, "learning_rate": 1.353279983689281e-05, "loss": 1.039, "step": 17365 }, { "epoch": 0.8676323676323676, "grad_norm": 1.9886165857315063, "learning_rate": 1.3507314338141597e-05, "loss": 1.0012, "step": 17370 }, { "epoch": 0.8678821178821179, "grad_norm": 1.6788328886032104, "learning_rate": 1.3481828839390388e-05, "loss": 0.9767, "step": 17375 }, { "epoch": 0.8681318681318682, "grad_norm": 1.8796966075897217, "learning_rate": 1.3456343340639176e-05, "loss": 1.0384, "step": 17380 }, { "epoch": 0.8683816183816184, "grad_norm": 1.853361964225769, "learning_rate": 1.3430857841887967e-05, "loss": 0.9532, "step": 17385 }, { "epoch": 0.8686313686313686, "grad_norm": 1.7327446937561035, "learning_rate": 1.3405372343136754e-05, "loss": 1.0042, "step": 17390 }, { "epoch": 0.8688811188811189, "grad_norm": 1.9971064329147339, "learning_rate": 1.3379886844385545e-05, "loss": 1.1211, "step": 17395 }, { "epoch": 0.8691308691308691, "grad_norm": 2.2282814979553223, "learning_rate": 1.3354401345634335e-05, "loss": 1.0341, "step": 17400 }, { "epoch": 0.8693806193806194, "grad_norm": 2.3488667011260986, "learning_rate": 1.3328915846883126e-05, "loss": 0.9356, "step": 17405 }, { "epoch": 0.8696303696303697, "grad_norm": 2.0104408264160156, "learning_rate": 1.3303430348131913e-05, "loss": 0.8631, "step": 17410 }, { "epoch": 0.8698801198801199, "grad_norm": 1.7662075757980347, "learning_rate": 1.3277944849380703e-05, "loss": 1.0383, "step": 17415 }, { "epoch": 0.8701298701298701, "grad_norm": 2.2956745624542236, "learning_rate": 1.3252459350629492e-05, "loss": 1.0784, "step": 17420 }, { "epoch": 0.8703796203796204, "grad_norm": 1.8537499904632568, "learning_rate": 1.3226973851878282e-05, "loss": 1.0004, "step": 17425 }, { "epoch": 0.8706293706293706, "grad_norm": 1.799378514289856, "learning_rate": 1.320148835312707e-05, "loss": 0.9903, "step": 17430 }, { "epoch": 0.8708791208791209, "grad_norm": 1.7341675758361816, "learning_rate": 1.317600285437586e-05, "loss": 0.9935, "step": 17435 }, { "epoch": 0.8711288711288712, "grad_norm": 1.5398197174072266, "learning_rate": 1.315051735562465e-05, "loss": 1.0036, "step": 17440 }, { "epoch": 0.8713786213786214, "grad_norm": 1.9615517854690552, "learning_rate": 1.3125031856873441e-05, "loss": 0.9703, "step": 17445 }, { "epoch": 0.8716283716283716, "grad_norm": 1.8382221460342407, "learning_rate": 1.3099546358122228e-05, "loss": 1.1091, "step": 17450 }, { "epoch": 0.8718781218781219, "grad_norm": 1.8736823797225952, "learning_rate": 1.3074060859371019e-05, "loss": 0.895, "step": 17455 }, { "epoch": 0.8721278721278721, "grad_norm": 1.541106104850769, "learning_rate": 1.3048575360619807e-05, "loss": 1.0772, "step": 17460 }, { "epoch": 0.8723776223776224, "grad_norm": 2.1284050941467285, "learning_rate": 1.3023089861868598e-05, "loss": 1.0394, "step": 17465 }, { "epoch": 0.8726273726273727, "grad_norm": 2.53953218460083, "learning_rate": 1.2997604363117385e-05, "loss": 0.9683, "step": 17470 }, { "epoch": 0.8728771228771228, "grad_norm": 1.9295132160186768, "learning_rate": 1.2972118864366175e-05, "loss": 0.9485, "step": 17475 }, { "epoch": 0.8731268731268731, "grad_norm": 1.9176146984100342, "learning_rate": 1.2946633365614966e-05, "loss": 1.0663, "step": 17480 }, { "epoch": 0.8733766233766234, "grad_norm": 2.833543062210083, "learning_rate": 1.2921147866863756e-05, "loss": 1.0613, "step": 17485 }, { "epoch": 0.8736263736263736, "grad_norm": 2.0927138328552246, "learning_rate": 1.2895662368112543e-05, "loss": 0.9558, "step": 17490 }, { "epoch": 0.8738761238761239, "grad_norm": 1.8688205480575562, "learning_rate": 1.2870176869361334e-05, "loss": 1.013, "step": 17495 }, { "epoch": 0.8741258741258742, "grad_norm": 2.0195364952087402, "learning_rate": 1.2844691370610123e-05, "loss": 1.0629, "step": 17500 }, { "epoch": 0.8743756243756243, "grad_norm": 1.9880263805389404, "learning_rate": 1.2819205871858913e-05, "loss": 0.9718, "step": 17505 }, { "epoch": 0.8746253746253746, "grad_norm": 2.1667869091033936, "learning_rate": 1.27937203731077e-05, "loss": 1.0007, "step": 17510 }, { "epoch": 0.8748751248751249, "grad_norm": 1.8646303415298462, "learning_rate": 1.276823487435649e-05, "loss": 1.0349, "step": 17515 }, { "epoch": 0.8751248751248751, "grad_norm": 1.9775793552398682, "learning_rate": 1.2742749375605281e-05, "loss": 1.0721, "step": 17520 }, { "epoch": 0.8753746253746254, "grad_norm": 1.851569414138794, "learning_rate": 1.2717263876854072e-05, "loss": 0.9632, "step": 17525 }, { "epoch": 0.8756243756243757, "grad_norm": 2.5530099868774414, "learning_rate": 1.2691778378102859e-05, "loss": 0.9691, "step": 17530 }, { "epoch": 0.8758741258741258, "grad_norm": 1.6354700326919556, "learning_rate": 1.266629287935165e-05, "loss": 1.1604, "step": 17535 }, { "epoch": 0.8761238761238761, "grad_norm": 2.2466964721679688, "learning_rate": 1.2640807380600438e-05, "loss": 1.124, "step": 17540 }, { "epoch": 0.8763736263736264, "grad_norm": 2.2455317974090576, "learning_rate": 1.2615321881849229e-05, "loss": 1.1073, "step": 17545 }, { "epoch": 0.8766233766233766, "grad_norm": 1.7227916717529297, "learning_rate": 1.2589836383098016e-05, "loss": 1.1204, "step": 17550 }, { "epoch": 0.8768731268731269, "grad_norm": 1.7173209190368652, "learning_rate": 1.2564350884346806e-05, "loss": 1.0549, "step": 17555 }, { "epoch": 0.8771228771228772, "grad_norm": 2.0812530517578125, "learning_rate": 1.2538865385595597e-05, "loss": 1.0131, "step": 17560 }, { "epoch": 0.8773726273726273, "grad_norm": 1.9836565256118774, "learning_rate": 1.2513379886844387e-05, "loss": 1.0343, "step": 17565 }, { "epoch": 0.8776223776223776, "grad_norm": 2.1579108238220215, "learning_rate": 1.2487894388093176e-05, "loss": 0.9726, "step": 17570 }, { "epoch": 0.8778721278721279, "grad_norm": 1.9863333702087402, "learning_rate": 1.2462408889341965e-05, "loss": 1.0222, "step": 17575 }, { "epoch": 0.8781218781218781, "grad_norm": 1.9387181997299194, "learning_rate": 1.2436923390590755e-05, "loss": 0.891, "step": 17580 }, { "epoch": 0.8783716283716284, "grad_norm": 1.6019679307937622, "learning_rate": 1.2411437891839542e-05, "loss": 0.9837, "step": 17585 }, { "epoch": 0.8786213786213786, "grad_norm": 2.072643995285034, "learning_rate": 1.2385952393088333e-05, "loss": 0.9472, "step": 17590 }, { "epoch": 0.8788711288711288, "grad_norm": 1.7814751863479614, "learning_rate": 1.2360466894337122e-05, "loss": 0.9327, "step": 17595 }, { "epoch": 0.8791208791208791, "grad_norm": 2.639600992202759, "learning_rate": 1.2334981395585912e-05, "loss": 1.0656, "step": 17600 }, { "epoch": 0.8793706293706294, "grad_norm": 2.267063856124878, "learning_rate": 1.2309495896834701e-05, "loss": 1.0431, "step": 17605 }, { "epoch": 0.8796203796203796, "grad_norm": 1.8464374542236328, "learning_rate": 1.2284010398083491e-05, "loss": 1.0967, "step": 17610 }, { "epoch": 0.8798701298701299, "grad_norm": 3.5855698585510254, "learning_rate": 1.225852489933228e-05, "loss": 1.0367, "step": 17615 }, { "epoch": 0.8801198801198801, "grad_norm": 2.0481770038604736, "learning_rate": 1.223303940058107e-05, "loss": 1.0223, "step": 17620 }, { "epoch": 0.8803696303696303, "grad_norm": 2.2424018383026123, "learning_rate": 1.220755390182986e-05, "loss": 0.9734, "step": 17625 }, { "epoch": 0.8806193806193806, "grad_norm": 1.7956483364105225, "learning_rate": 1.2182068403078648e-05, "loss": 1.0998, "step": 17630 }, { "epoch": 0.8808691308691309, "grad_norm": 1.758476734161377, "learning_rate": 1.2156582904327437e-05, "loss": 0.8978, "step": 17635 }, { "epoch": 0.8811188811188811, "grad_norm": 2.1964128017425537, "learning_rate": 1.2131097405576228e-05, "loss": 0.9733, "step": 17640 }, { "epoch": 0.8813686313686314, "grad_norm": 1.8912991285324097, "learning_rate": 1.2105611906825016e-05, "loss": 0.9697, "step": 17645 }, { "epoch": 0.8816183816183816, "grad_norm": 1.8740358352661133, "learning_rate": 1.2080126408073807e-05, "loss": 1.0996, "step": 17650 }, { "epoch": 0.8818681318681318, "grad_norm": 2.5932888984680176, "learning_rate": 1.2054640909322596e-05, "loss": 0.9584, "step": 17655 }, { "epoch": 0.8821178821178821, "grad_norm": 1.695670247077942, "learning_rate": 1.2029155410571386e-05, "loss": 1.0548, "step": 17660 }, { "epoch": 0.8823676323676324, "grad_norm": 1.5623981952667236, "learning_rate": 1.2003669911820175e-05, "loss": 1.0443, "step": 17665 }, { "epoch": 0.8826173826173827, "grad_norm": 1.7407293319702148, "learning_rate": 1.1978184413068964e-05, "loss": 1.0247, "step": 17670 }, { "epoch": 0.8828671328671329, "grad_norm": 1.8472826480865479, "learning_rate": 1.1952698914317753e-05, "loss": 0.9662, "step": 17675 }, { "epoch": 0.8831168831168831, "grad_norm": 1.9506711959838867, "learning_rate": 1.1927213415566543e-05, "loss": 1.0738, "step": 17680 }, { "epoch": 0.8833666333666333, "grad_norm": 2.2110369205474854, "learning_rate": 1.1901727916815332e-05, "loss": 1.0545, "step": 17685 }, { "epoch": 0.8836163836163836, "grad_norm": 1.8195209503173828, "learning_rate": 1.1876242418064122e-05, "loss": 0.9882, "step": 17690 }, { "epoch": 0.8838661338661339, "grad_norm": 1.7497825622558594, "learning_rate": 1.1850756919312911e-05, "loss": 1.0808, "step": 17695 }, { "epoch": 0.8841158841158842, "grad_norm": 1.6332266330718994, "learning_rate": 1.1825271420561702e-05, "loss": 1.0613, "step": 17700 }, { "epoch": 0.8843656343656343, "grad_norm": 1.8048315048217773, "learning_rate": 1.179978592181049e-05, "loss": 0.9612, "step": 17705 }, { "epoch": 0.8846153846153846, "grad_norm": 1.9236445426940918, "learning_rate": 1.177430042305928e-05, "loss": 0.9771, "step": 17710 }, { "epoch": 0.8848651348651349, "grad_norm": 2.026404619216919, "learning_rate": 1.1748814924308068e-05, "loss": 0.9147, "step": 17715 }, { "epoch": 0.8851148851148851, "grad_norm": 2.107342481613159, "learning_rate": 1.1723329425556858e-05, "loss": 0.9632, "step": 17720 }, { "epoch": 0.8853646353646354, "grad_norm": 2.2882838249206543, "learning_rate": 1.1697843926805647e-05, "loss": 0.9555, "step": 17725 }, { "epoch": 0.8856143856143857, "grad_norm": 1.9129308462142944, "learning_rate": 1.1672358428054438e-05, "loss": 1.0466, "step": 17730 }, { "epoch": 0.8858641358641358, "grad_norm": 1.9656294584274292, "learning_rate": 1.1646872929303227e-05, "loss": 0.9913, "step": 17735 }, { "epoch": 0.8861138861138861, "grad_norm": 1.7544306516647339, "learning_rate": 1.1621387430552017e-05, "loss": 1.05, "step": 17740 }, { "epoch": 0.8863636363636364, "grad_norm": 2.1499834060668945, "learning_rate": 1.1595901931800806e-05, "loss": 1.0332, "step": 17745 }, { "epoch": 0.8866133866133866, "grad_norm": 2.273444175720215, "learning_rate": 1.1570416433049595e-05, "loss": 1.068, "step": 17750 }, { "epoch": 0.8868631368631369, "grad_norm": 2.4082367420196533, "learning_rate": 1.1544930934298383e-05, "loss": 0.9895, "step": 17755 }, { "epoch": 0.8871128871128872, "grad_norm": 1.7601879835128784, "learning_rate": 1.1519445435547174e-05, "loss": 0.9225, "step": 17760 }, { "epoch": 0.8873626373626373, "grad_norm": 2.0017597675323486, "learning_rate": 1.1493959936795963e-05, "loss": 0.9747, "step": 17765 }, { "epoch": 0.8876123876123876, "grad_norm": 2.9899749755859375, "learning_rate": 1.1468474438044753e-05, "loss": 0.9153, "step": 17770 }, { "epoch": 0.8878621378621379, "grad_norm": 1.8050551414489746, "learning_rate": 1.1442988939293542e-05, "loss": 0.9799, "step": 17775 }, { "epoch": 0.8881118881118881, "grad_norm": 1.609790325164795, "learning_rate": 1.1417503440542332e-05, "loss": 0.9009, "step": 17780 }, { "epoch": 0.8883616383616384, "grad_norm": 1.5506865978240967, "learning_rate": 1.1392017941791121e-05, "loss": 0.9607, "step": 17785 }, { "epoch": 0.8886113886113887, "grad_norm": 1.8247127532958984, "learning_rate": 1.136653244303991e-05, "loss": 1.0322, "step": 17790 }, { "epoch": 0.8888611388611388, "grad_norm": 2.559390068054199, "learning_rate": 1.1341046944288699e-05, "loss": 1.0734, "step": 17795 }, { "epoch": 0.8891108891108891, "grad_norm": 2.556509017944336, "learning_rate": 1.131556144553749e-05, "loss": 1.0431, "step": 17800 }, { "epoch": 0.8893606393606394, "grad_norm": 1.9374778270721436, "learning_rate": 1.1290075946786278e-05, "loss": 1.0267, "step": 17805 }, { "epoch": 0.8896103896103896, "grad_norm": 1.9277896881103516, "learning_rate": 1.1264590448035069e-05, "loss": 1.0169, "step": 17810 }, { "epoch": 0.8898601398601399, "grad_norm": 1.7669763565063477, "learning_rate": 1.1239104949283857e-05, "loss": 0.9244, "step": 17815 }, { "epoch": 0.8901098901098901, "grad_norm": 1.6849350929260254, "learning_rate": 1.1213619450532648e-05, "loss": 0.8814, "step": 17820 }, { "epoch": 0.8903596403596403, "grad_norm": 2.6589133739471436, "learning_rate": 1.1188133951781437e-05, "loss": 0.9377, "step": 17825 }, { "epoch": 0.8906093906093906, "grad_norm": 1.9943712949752808, "learning_rate": 1.1162648453030227e-05, "loss": 0.9734, "step": 17830 }, { "epoch": 0.8908591408591409, "grad_norm": 2.023123264312744, "learning_rate": 1.1137162954279014e-05, "loss": 1.0245, "step": 17835 }, { "epoch": 0.8911088911088911, "grad_norm": 1.7804713249206543, "learning_rate": 1.1111677455527805e-05, "loss": 1.0375, "step": 17840 }, { "epoch": 0.8913586413586414, "grad_norm": 1.5409646034240723, "learning_rate": 1.1086191956776594e-05, "loss": 0.9648, "step": 17845 }, { "epoch": 0.8916083916083916, "grad_norm": 1.8771456480026245, "learning_rate": 1.1060706458025384e-05, "loss": 1.0418, "step": 17850 }, { "epoch": 0.8918581418581418, "grad_norm": 1.8884022235870361, "learning_rate": 1.1035220959274173e-05, "loss": 1.0099, "step": 17855 }, { "epoch": 0.8921078921078921, "grad_norm": 1.7785155773162842, "learning_rate": 1.1009735460522963e-05, "loss": 1.0098, "step": 17860 }, { "epoch": 0.8923576423576424, "grad_norm": 1.923071026802063, "learning_rate": 1.0984249961771752e-05, "loss": 1.0402, "step": 17865 }, { "epoch": 0.8926073926073926, "grad_norm": 1.971932291984558, "learning_rate": 1.0958764463020543e-05, "loss": 1.0704, "step": 17870 }, { "epoch": 0.8928571428571429, "grad_norm": 1.899057149887085, "learning_rate": 1.0933278964269331e-05, "loss": 1.1201, "step": 17875 }, { "epoch": 0.8931068931068931, "grad_norm": 1.5085813999176025, "learning_rate": 1.090779346551812e-05, "loss": 1.0108, "step": 17880 }, { "epoch": 0.8933566433566433, "grad_norm": 1.7506465911865234, "learning_rate": 1.0882307966766909e-05, "loss": 0.9995, "step": 17885 }, { "epoch": 0.8936063936063936, "grad_norm": 1.9170492887496948, "learning_rate": 1.08568224680157e-05, "loss": 0.9199, "step": 17890 }, { "epoch": 0.8938561438561439, "grad_norm": 1.731548547744751, "learning_rate": 1.0831336969264488e-05, "loss": 1.0158, "step": 17895 }, { "epoch": 0.8941058941058941, "grad_norm": 1.777715802192688, "learning_rate": 1.0805851470513279e-05, "loss": 1.0237, "step": 17900 }, { "epoch": 0.8943556443556444, "grad_norm": 2.019514322280884, "learning_rate": 1.0780365971762068e-05, "loss": 0.9998, "step": 17905 }, { "epoch": 0.8946053946053946, "grad_norm": 1.9748470783233643, "learning_rate": 1.0754880473010858e-05, "loss": 0.9649, "step": 17910 }, { "epoch": 0.8948551448551448, "grad_norm": 1.6097402572631836, "learning_rate": 1.0729394974259647e-05, "loss": 1.0738, "step": 17915 }, { "epoch": 0.8951048951048951, "grad_norm": 2.079040050506592, "learning_rate": 1.0703909475508436e-05, "loss": 0.9712, "step": 17920 }, { "epoch": 0.8953546453546454, "grad_norm": 2.5154991149902344, "learning_rate": 1.0678423976757224e-05, "loss": 1.0621, "step": 17925 }, { "epoch": 0.8956043956043956, "grad_norm": 1.8476665019989014, "learning_rate": 1.0652938478006015e-05, "loss": 1.0317, "step": 17930 }, { "epoch": 0.8958541458541458, "grad_norm": 1.5467174053192139, "learning_rate": 1.0627452979254804e-05, "loss": 1.0021, "step": 17935 }, { "epoch": 0.8961038961038961, "grad_norm": 2.2992076873779297, "learning_rate": 1.0601967480503594e-05, "loss": 1.0141, "step": 17940 }, { "epoch": 0.8963536463536463, "grad_norm": 2.026481866836548, "learning_rate": 1.0576481981752383e-05, "loss": 1.0389, "step": 17945 }, { "epoch": 0.8966033966033966, "grad_norm": 2.0690081119537354, "learning_rate": 1.0550996483001173e-05, "loss": 1.0906, "step": 17950 }, { "epoch": 0.8968531468531469, "grad_norm": 1.877111792564392, "learning_rate": 1.0525510984249962e-05, "loss": 1.0355, "step": 17955 }, { "epoch": 0.8971028971028971, "grad_norm": 1.9008266925811768, "learning_rate": 1.0500025485498751e-05, "loss": 1.0125, "step": 17960 }, { "epoch": 0.8973526473526473, "grad_norm": 2.093635082244873, "learning_rate": 1.047453998674754e-05, "loss": 1.0093, "step": 17965 }, { "epoch": 0.8976023976023976, "grad_norm": 1.7617932558059692, "learning_rate": 1.044905448799633e-05, "loss": 1.0745, "step": 17970 }, { "epoch": 0.8978521478521478, "grad_norm": 1.8974730968475342, "learning_rate": 1.0423568989245119e-05, "loss": 1.0745, "step": 17975 }, { "epoch": 0.8981018981018981, "grad_norm": 2.243251323699951, "learning_rate": 1.039808349049391e-05, "loss": 1.034, "step": 17980 }, { "epoch": 0.8983516483516484, "grad_norm": 1.9627981185913086, "learning_rate": 1.0372597991742698e-05, "loss": 1.131, "step": 17985 }, { "epoch": 0.8986013986013986, "grad_norm": 1.7138078212738037, "learning_rate": 1.0347112492991489e-05, "loss": 1.0511, "step": 17990 }, { "epoch": 0.8988511488511488, "grad_norm": 1.9871160984039307, "learning_rate": 1.0321626994240278e-05, "loss": 1.0021, "step": 17995 }, { "epoch": 0.8991008991008991, "grad_norm": 1.7271416187286377, "learning_rate": 1.0296141495489066e-05, "loss": 1.154, "step": 18000 }, { "epoch": 0.8993506493506493, "grad_norm": 2.4654324054718018, "learning_rate": 1.0270655996737855e-05, "loss": 0.9249, "step": 18005 }, { "epoch": 0.8996003996003996, "grad_norm": 2.026982307434082, "learning_rate": 1.0245170497986646e-05, "loss": 1.0665, "step": 18010 }, { "epoch": 0.8998501498501499, "grad_norm": 1.5616885423660278, "learning_rate": 1.0219684999235435e-05, "loss": 1.0365, "step": 18015 }, { "epoch": 0.9000999000999002, "grad_norm": 2.1697912216186523, "learning_rate": 1.0194199500484225e-05, "loss": 0.9141, "step": 18020 }, { "epoch": 0.9003496503496503, "grad_norm": 1.8518950939178467, "learning_rate": 1.0168714001733014e-05, "loss": 0.9782, "step": 18025 }, { "epoch": 0.9005994005994006, "grad_norm": 2.2222564220428467, "learning_rate": 1.0143228502981804e-05, "loss": 1.0388, "step": 18030 }, { "epoch": 0.9008491508491508, "grad_norm": 2.1649131774902344, "learning_rate": 1.0117743004230593e-05, "loss": 1.0365, "step": 18035 }, { "epoch": 0.9010989010989011, "grad_norm": 3.0204410552978516, "learning_rate": 1.0092257505479382e-05, "loss": 0.9872, "step": 18040 }, { "epoch": 0.9013486513486514, "grad_norm": 1.9031206369400024, "learning_rate": 1.006677200672817e-05, "loss": 1.0422, "step": 18045 }, { "epoch": 0.9015984015984015, "grad_norm": 1.7667886018753052, "learning_rate": 1.0041286507976961e-05, "loss": 0.9394, "step": 18050 }, { "epoch": 0.9018481518481518, "grad_norm": 3.504326105117798, "learning_rate": 1.001580100922575e-05, "loss": 1.0903, "step": 18055 }, { "epoch": 0.9020979020979021, "grad_norm": 1.9668633937835693, "learning_rate": 9.99031551047454e-06, "loss": 0.9013, "step": 18060 }, { "epoch": 0.9023476523476524, "grad_norm": 2.3647449016571045, "learning_rate": 9.96483001172333e-06, "loss": 1.1462, "step": 18065 }, { "epoch": 0.9025974025974026, "grad_norm": 1.8012830018997192, "learning_rate": 9.93934451297212e-06, "loss": 0.9491, "step": 18070 }, { "epoch": 0.9028471528471529, "grad_norm": 1.6024144887924194, "learning_rate": 9.913859014220909e-06, "loss": 0.9263, "step": 18075 }, { "epoch": 0.903096903096903, "grad_norm": 3.3394687175750732, "learning_rate": 9.888373515469699e-06, "loss": 1.1108, "step": 18080 }, { "epoch": 0.9033466533466533, "grad_norm": 1.8925690650939941, "learning_rate": 9.862888016718486e-06, "loss": 0.9806, "step": 18085 }, { "epoch": 0.9035964035964036, "grad_norm": 2.6975667476654053, "learning_rate": 9.837402517967277e-06, "loss": 0.9843, "step": 18090 }, { "epoch": 0.9038461538461539, "grad_norm": 1.3971058130264282, "learning_rate": 9.811917019216065e-06, "loss": 0.9539, "step": 18095 }, { "epoch": 0.9040959040959041, "grad_norm": 2.6721441745758057, "learning_rate": 9.786431520464856e-06, "loss": 0.9829, "step": 18100 }, { "epoch": 0.9043456543456544, "grad_norm": 1.9569125175476074, "learning_rate": 9.760946021713645e-06, "loss": 1.0171, "step": 18105 }, { "epoch": 0.9045954045954046, "grad_norm": 1.9756898880004883, "learning_rate": 9.735460522962435e-06, "loss": 0.9274, "step": 18110 }, { "epoch": 0.9048451548451548, "grad_norm": 1.6632827520370483, "learning_rate": 9.709975024211224e-06, "loss": 1.0464, "step": 18115 }, { "epoch": 0.9050949050949051, "grad_norm": 3.111236810684204, "learning_rate": 9.684489525460014e-06, "loss": 1.0405, "step": 18120 }, { "epoch": 0.9053446553446554, "grad_norm": 2.4695518016815186, "learning_rate": 9.659004026708803e-06, "loss": 0.9845, "step": 18125 }, { "epoch": 0.9055944055944056, "grad_norm": 1.6791305541992188, "learning_rate": 9.633518527957592e-06, "loss": 0.9877, "step": 18130 }, { "epoch": 0.9058441558441559, "grad_norm": 2.053934097290039, "learning_rate": 9.608033029206381e-06, "loss": 0.9062, "step": 18135 }, { "epoch": 0.906093906093906, "grad_norm": 1.9258638620376587, "learning_rate": 9.582547530455171e-06, "loss": 1.0656, "step": 18140 }, { "epoch": 0.9063436563436563, "grad_norm": 1.7634193897247314, "learning_rate": 9.55706203170396e-06, "loss": 1.0365, "step": 18145 }, { "epoch": 0.9065934065934066, "grad_norm": 2.2951254844665527, "learning_rate": 9.53157653295275e-06, "loss": 1.0593, "step": 18150 }, { "epoch": 0.9068431568431569, "grad_norm": 3.1595919132232666, "learning_rate": 9.50609103420154e-06, "loss": 0.9488, "step": 18155 }, { "epoch": 0.9070929070929071, "grad_norm": 1.4009931087493896, "learning_rate": 9.48060553545033e-06, "loss": 0.9627, "step": 18160 }, { "epoch": 0.9073426573426573, "grad_norm": 2.1011440753936768, "learning_rate": 9.455120036699119e-06, "loss": 1.0102, "step": 18165 }, { "epoch": 0.9075924075924076, "grad_norm": 2.3853073120117188, "learning_rate": 9.429634537947908e-06, "loss": 0.9446, "step": 18170 }, { "epoch": 0.9078421578421578, "grad_norm": 1.587053894996643, "learning_rate": 9.404149039196696e-06, "loss": 1.0006, "step": 18175 }, { "epoch": 0.9080919080919081, "grad_norm": 1.9235889911651611, "learning_rate": 9.378663540445487e-06, "loss": 1.081, "step": 18180 }, { "epoch": 0.9083416583416584, "grad_norm": 1.3626207113265991, "learning_rate": 9.353178041694276e-06, "loss": 1.0029, "step": 18185 }, { "epoch": 0.9085914085914086, "grad_norm": 2.8458852767944336, "learning_rate": 9.327692542943066e-06, "loss": 1.0582, "step": 18190 }, { "epoch": 0.9088411588411588, "grad_norm": 1.8635680675506592, "learning_rate": 9.302207044191855e-06, "loss": 0.952, "step": 18195 }, { "epoch": 0.9090909090909091, "grad_norm": 1.9722247123718262, "learning_rate": 9.276721545440645e-06, "loss": 1.0151, "step": 18200 }, { "epoch": 0.9093406593406593, "grad_norm": 1.5659449100494385, "learning_rate": 9.251236046689434e-06, "loss": 1.0534, "step": 18205 }, { "epoch": 0.9095904095904096, "grad_norm": 1.7085493803024292, "learning_rate": 9.225750547938223e-06, "loss": 0.9884, "step": 18210 }, { "epoch": 0.9098401598401599, "grad_norm": 1.7360917329788208, "learning_rate": 9.200265049187012e-06, "loss": 1.034, "step": 18215 }, { "epoch": 0.9100899100899101, "grad_norm": 1.7596150636672974, "learning_rate": 9.174779550435802e-06, "loss": 1.0227, "step": 18220 }, { "epoch": 0.9103396603396603, "grad_norm": 1.5227673053741455, "learning_rate": 9.149294051684591e-06, "loss": 1.0018, "step": 18225 }, { "epoch": 0.9105894105894106, "grad_norm": 1.9915071725845337, "learning_rate": 9.123808552933381e-06, "loss": 1.0133, "step": 18230 }, { "epoch": 0.9108391608391608, "grad_norm": 1.851678490638733, "learning_rate": 9.09832305418217e-06, "loss": 0.961, "step": 18235 }, { "epoch": 0.9110889110889111, "grad_norm": 2.1686151027679443, "learning_rate": 9.07283755543096e-06, "loss": 0.946, "step": 18240 }, { "epoch": 0.9113386613386614, "grad_norm": 1.8354591131210327, "learning_rate": 9.04735205667975e-06, "loss": 0.9203, "step": 18245 }, { "epoch": 0.9115884115884116, "grad_norm": 2.3302810192108154, "learning_rate": 9.021866557928538e-06, "loss": 1.0291, "step": 18250 }, { "epoch": 0.9118381618381618, "grad_norm": 2.3796472549438477, "learning_rate": 8.996381059177327e-06, "loss": 0.9375, "step": 18255 }, { "epoch": 0.9120879120879121, "grad_norm": 2.341513156890869, "learning_rate": 8.970895560426118e-06, "loss": 1.0027, "step": 18260 }, { "epoch": 0.9123376623376623, "grad_norm": 1.3514758348464966, "learning_rate": 8.945410061674906e-06, "loss": 1.093, "step": 18265 }, { "epoch": 0.9125874125874126, "grad_norm": 1.8483206033706665, "learning_rate": 8.919924562923697e-06, "loss": 1.0079, "step": 18270 }, { "epoch": 0.9128371628371629, "grad_norm": 2.536463499069214, "learning_rate": 8.894439064172486e-06, "loss": 1.0618, "step": 18275 }, { "epoch": 0.913086913086913, "grad_norm": 2.0243031978607178, "learning_rate": 8.868953565421276e-06, "loss": 1.1269, "step": 18280 }, { "epoch": 0.9133366633366633, "grad_norm": 2.46791934967041, "learning_rate": 8.843468066670065e-06, "loss": 0.9604, "step": 18285 }, { "epoch": 0.9135864135864136, "grad_norm": 1.6411150693893433, "learning_rate": 8.817982567918854e-06, "loss": 0.8838, "step": 18290 }, { "epoch": 0.9138361638361638, "grad_norm": 1.7467955350875854, "learning_rate": 8.792497069167643e-06, "loss": 0.9706, "step": 18295 }, { "epoch": 0.9140859140859141, "grad_norm": 1.6048816442489624, "learning_rate": 8.767011570416433e-06, "loss": 0.9614, "step": 18300 }, { "epoch": 0.9143356643356644, "grad_norm": 1.8179031610488892, "learning_rate": 8.741526071665222e-06, "loss": 1.0718, "step": 18305 }, { "epoch": 0.9145854145854145, "grad_norm": 2.049791097640991, "learning_rate": 8.716040572914012e-06, "loss": 1.004, "step": 18310 }, { "epoch": 0.9148351648351648, "grad_norm": 2.91074538230896, "learning_rate": 8.690555074162801e-06, "loss": 1.0329, "step": 18315 }, { "epoch": 0.9150849150849151, "grad_norm": 1.884260654449463, "learning_rate": 8.665069575411592e-06, "loss": 1.0509, "step": 18320 }, { "epoch": 0.9153346653346653, "grad_norm": 2.3252034187316895, "learning_rate": 8.63958407666038e-06, "loss": 0.9293, "step": 18325 }, { "epoch": 0.9155844155844156, "grad_norm": 3.4706296920776367, "learning_rate": 8.614098577909171e-06, "loss": 1.0658, "step": 18330 }, { "epoch": 0.9158341658341659, "grad_norm": 2.3579697608947754, "learning_rate": 8.58861307915796e-06, "loss": 0.9759, "step": 18335 }, { "epoch": 0.916083916083916, "grad_norm": 1.7159534692764282, "learning_rate": 8.563127580406749e-06, "loss": 1.0577, "step": 18340 }, { "epoch": 0.9163336663336663, "grad_norm": 1.8022551536560059, "learning_rate": 8.537642081655537e-06, "loss": 1.0721, "step": 18345 }, { "epoch": 0.9165834165834166, "grad_norm": 2.1520578861236572, "learning_rate": 8.512156582904328e-06, "loss": 1.0087, "step": 18350 }, { "epoch": 0.9168331668331668, "grad_norm": 1.723617434501648, "learning_rate": 8.486671084153117e-06, "loss": 0.9418, "step": 18355 }, { "epoch": 0.9170829170829171, "grad_norm": 3.2345311641693115, "learning_rate": 8.461185585401907e-06, "loss": 1.094, "step": 18360 }, { "epoch": 0.9173326673326674, "grad_norm": 1.6832287311553955, "learning_rate": 8.435700086650696e-06, "loss": 1.0844, "step": 18365 }, { "epoch": 0.9175824175824175, "grad_norm": 1.665411114692688, "learning_rate": 8.410214587899486e-06, "loss": 1.0432, "step": 18370 }, { "epoch": 0.9178321678321678, "grad_norm": 1.6689794063568115, "learning_rate": 8.384729089148275e-06, "loss": 0.9386, "step": 18375 }, { "epoch": 0.9180819180819181, "grad_norm": 1.7706012725830078, "learning_rate": 8.359243590397064e-06, "loss": 1.1097, "step": 18380 }, { "epoch": 0.9183316683316683, "grad_norm": 1.982056736946106, "learning_rate": 8.333758091645853e-06, "loss": 1.0258, "step": 18385 }, { "epoch": 0.9185814185814186, "grad_norm": 1.3514657020568848, "learning_rate": 8.308272592894643e-06, "loss": 0.9493, "step": 18390 }, { "epoch": 0.9188311688311688, "grad_norm": 1.6164677143096924, "learning_rate": 8.282787094143432e-06, "loss": 0.8889, "step": 18395 }, { "epoch": 0.919080919080919, "grad_norm": 2.169297695159912, "learning_rate": 8.257301595392223e-06, "loss": 0.9938, "step": 18400 }, { "epoch": 0.9193306693306693, "grad_norm": 1.965470314025879, "learning_rate": 8.231816096641011e-06, "loss": 1.0453, "step": 18405 }, { "epoch": 0.9195804195804196, "grad_norm": 1.8052856922149658, "learning_rate": 8.206330597889802e-06, "loss": 1.0728, "step": 18410 }, { "epoch": 0.9198301698301699, "grad_norm": 2.1651053428649902, "learning_rate": 8.18084509913859e-06, "loss": 0.9957, "step": 18415 }, { "epoch": 0.9200799200799201, "grad_norm": 1.9064651727676392, "learning_rate": 8.15535960038738e-06, "loss": 1.0635, "step": 18420 }, { "epoch": 0.9203296703296703, "grad_norm": 1.7845244407653809, "learning_rate": 8.129874101636168e-06, "loss": 1.0535, "step": 18425 }, { "epoch": 0.9205794205794205, "grad_norm": 2.178215980529785, "learning_rate": 8.104388602884959e-06, "loss": 1.0371, "step": 18430 }, { "epoch": 0.9208291708291708, "grad_norm": 1.77186119556427, "learning_rate": 8.078903104133747e-06, "loss": 0.9873, "step": 18435 }, { "epoch": 0.9210789210789211, "grad_norm": 2.1464548110961914, "learning_rate": 8.053417605382538e-06, "loss": 1.0178, "step": 18440 }, { "epoch": 0.9213286713286714, "grad_norm": 1.7870804071426392, "learning_rate": 8.027932106631327e-06, "loss": 0.9716, "step": 18445 }, { "epoch": 0.9215784215784216, "grad_norm": 1.834768295288086, "learning_rate": 8.002446607880117e-06, "loss": 0.9086, "step": 18450 }, { "epoch": 0.9218281718281718, "grad_norm": 2.5277905464172363, "learning_rate": 7.976961109128906e-06, "loss": 0.9798, "step": 18455 }, { "epoch": 0.922077922077922, "grad_norm": 2.3377206325531006, "learning_rate": 7.951475610377695e-06, "loss": 0.9665, "step": 18460 }, { "epoch": 0.9223276723276723, "grad_norm": 1.4897335767745972, "learning_rate": 7.925990111626484e-06, "loss": 1.0876, "step": 18465 }, { "epoch": 0.9225774225774226, "grad_norm": 1.7513105869293213, "learning_rate": 7.900504612875274e-06, "loss": 0.9951, "step": 18470 }, { "epoch": 0.9228271728271729, "grad_norm": 2.4313504695892334, "learning_rate": 7.875019114124063e-06, "loss": 0.9491, "step": 18475 }, { "epoch": 0.9230769230769231, "grad_norm": 1.8135241270065308, "learning_rate": 7.849533615372853e-06, "loss": 1.1494, "step": 18480 }, { "epoch": 0.9233266733266733, "grad_norm": 1.9991563558578491, "learning_rate": 7.824048116621642e-06, "loss": 1.051, "step": 18485 }, { "epoch": 0.9235764235764236, "grad_norm": 2.1293344497680664, "learning_rate": 7.798562617870433e-06, "loss": 0.9881, "step": 18490 }, { "epoch": 0.9238261738261738, "grad_norm": 2.123722791671753, "learning_rate": 7.773077119119221e-06, "loss": 0.9916, "step": 18495 }, { "epoch": 0.9240759240759241, "grad_norm": 1.711283564567566, "learning_rate": 7.74759162036801e-06, "loss": 1.0417, "step": 18500 }, { "epoch": 0.9243256743256744, "grad_norm": 1.7089015245437622, "learning_rate": 7.722106121616799e-06, "loss": 0.9918, "step": 18505 }, { "epoch": 0.9245754245754245, "grad_norm": 1.7579046487808228, "learning_rate": 7.69662062286559e-06, "loss": 0.9934, "step": 18510 }, { "epoch": 0.9248251748251748, "grad_norm": 1.9429951906204224, "learning_rate": 7.671135124114378e-06, "loss": 0.9949, "step": 18515 }, { "epoch": 0.9250749250749251, "grad_norm": 1.797650694847107, "learning_rate": 7.645649625363169e-06, "loss": 0.9645, "step": 18520 }, { "epoch": 0.9253246753246753, "grad_norm": 1.9151942729949951, "learning_rate": 7.620164126611958e-06, "loss": 0.9293, "step": 18525 }, { "epoch": 0.9255744255744256, "grad_norm": 2.59161114692688, "learning_rate": 7.594678627860747e-06, "loss": 1.1238, "step": 18530 }, { "epoch": 0.9258241758241759, "grad_norm": 1.7695032358169556, "learning_rate": 7.569193129109536e-06, "loss": 0.8802, "step": 18535 }, { "epoch": 0.926073926073926, "grad_norm": 2.31372332572937, "learning_rate": 7.5437076303583265e-06, "loss": 1.0127, "step": 18540 }, { "epoch": 0.9263236763236763, "grad_norm": 2.191908597946167, "learning_rate": 7.518222131607115e-06, "loss": 0.9565, "step": 18545 }, { "epoch": 0.9265734265734266, "grad_norm": 1.8730500936508179, "learning_rate": 7.492736632855906e-06, "loss": 1.0577, "step": 18550 }, { "epoch": 0.9268231768231768, "grad_norm": 2.016209125518799, "learning_rate": 7.467251134104694e-06, "loss": 1.024, "step": 18555 }, { "epoch": 0.9270729270729271, "grad_norm": 1.6173129081726074, "learning_rate": 7.441765635353484e-06, "loss": 1.0519, "step": 18560 }, { "epoch": 0.9273226773226774, "grad_norm": 1.9093657732009888, "learning_rate": 7.416280136602273e-06, "loss": 0.9577, "step": 18565 }, { "epoch": 0.9275724275724275, "grad_norm": 2.0689358711242676, "learning_rate": 7.3907946378510635e-06, "loss": 1.0094, "step": 18570 }, { "epoch": 0.9278221778221778, "grad_norm": 1.6102666854858398, "learning_rate": 7.3653091390998515e-06, "loss": 1.0094, "step": 18575 }, { "epoch": 0.9280719280719281, "grad_norm": 2.1194565296173096, "learning_rate": 7.339823640348642e-06, "loss": 0.9607, "step": 18580 }, { "epoch": 0.9283216783216783, "grad_norm": 1.3749233484268188, "learning_rate": 7.314338141597431e-06, "loss": 0.8998, "step": 18585 }, { "epoch": 0.9285714285714286, "grad_norm": 2.4547903537750244, "learning_rate": 7.288852642846221e-06, "loss": 0.9795, "step": 18590 }, { "epoch": 0.9288211788211789, "grad_norm": 1.8093096017837524, "learning_rate": 7.26336714409501e-06, "loss": 0.9069, "step": 18595 }, { "epoch": 0.929070929070929, "grad_norm": 1.6335973739624023, "learning_rate": 7.2378816453438e-06, "loss": 1.0254, "step": 18600 }, { "epoch": 0.9293206793206793, "grad_norm": 1.5853934288024902, "learning_rate": 7.2123961465925885e-06, "loss": 0.9417, "step": 18605 }, { "epoch": 0.9295704295704296, "grad_norm": 2.123244285583496, "learning_rate": 7.186910647841379e-06, "loss": 0.9305, "step": 18610 }, { "epoch": 0.9298201798201798, "grad_norm": 2.003783702850342, "learning_rate": 7.161425149090168e-06, "loss": 1.048, "step": 18615 }, { "epoch": 0.9300699300699301, "grad_norm": 1.520440936088562, "learning_rate": 7.135939650338957e-06, "loss": 1.0321, "step": 18620 }, { "epoch": 0.9303196803196803, "grad_norm": 1.6507025957107544, "learning_rate": 7.110454151587746e-06, "loss": 0.9592, "step": 18625 }, { "epoch": 0.9305694305694305, "grad_norm": 1.7328503131866455, "learning_rate": 7.084968652836537e-06, "loss": 0.9997, "step": 18630 }, { "epoch": 0.9308191808191808, "grad_norm": 2.8881027698516846, "learning_rate": 7.0594831540853254e-06, "loss": 1.0301, "step": 18635 }, { "epoch": 0.9310689310689311, "grad_norm": 1.936767578125, "learning_rate": 7.033997655334115e-06, "loss": 0.9988, "step": 18640 }, { "epoch": 0.9313186813186813, "grad_norm": 1.6560795307159424, "learning_rate": 7.008512156582904e-06, "loss": 1.1132, "step": 18645 }, { "epoch": 0.9315684315684316, "grad_norm": 1.782376766204834, "learning_rate": 6.983026657831694e-06, "loss": 1.0863, "step": 18650 }, { "epoch": 0.9318181818181818, "grad_norm": 1.7928199768066406, "learning_rate": 6.957541159080483e-06, "loss": 1.1084, "step": 18655 }, { "epoch": 0.932067932067932, "grad_norm": 1.6718978881835938, "learning_rate": 6.932055660329273e-06, "loss": 0.9691, "step": 18660 }, { "epoch": 0.9323176823176823, "grad_norm": 1.9431606531143188, "learning_rate": 6.906570161578062e-06, "loss": 1.0085, "step": 18665 }, { "epoch": 0.9325674325674326, "grad_norm": 1.7922767400741577, "learning_rate": 6.881084662826852e-06, "loss": 0.9844, "step": 18670 }, { "epoch": 0.9328171828171828, "grad_norm": 2.7448134422302246, "learning_rate": 6.855599164075641e-06, "loss": 1.0783, "step": 18675 }, { "epoch": 0.9330669330669331, "grad_norm": 1.741610050201416, "learning_rate": 6.8301136653244305e-06, "loss": 0.9991, "step": 18680 }, { "epoch": 0.9333166833166833, "grad_norm": 1.6476861238479614, "learning_rate": 6.804628166573219e-06, "loss": 0.9887, "step": 18685 }, { "epoch": 0.9335664335664335, "grad_norm": 3.083534002304077, "learning_rate": 6.77914266782201e-06, "loss": 1.0085, "step": 18690 }, { "epoch": 0.9338161838161838, "grad_norm": 2.7006940841674805, "learning_rate": 6.753657169070799e-06, "loss": 0.9504, "step": 18695 }, { "epoch": 0.9340659340659341, "grad_norm": 1.752263069152832, "learning_rate": 6.728171670319588e-06, "loss": 0.9922, "step": 18700 }, { "epoch": 0.9343156843156843, "grad_norm": 1.7814507484436035, "learning_rate": 6.702686171568377e-06, "loss": 0.9799, "step": 18705 }, { "epoch": 0.9345654345654346, "grad_norm": 2.602396249771118, "learning_rate": 6.6772006728171675e-06, "loss": 1.0402, "step": 18710 }, { "epoch": 0.9348151848151848, "grad_norm": 1.4408291578292847, "learning_rate": 6.651715174065956e-06, "loss": 0.9724, "step": 18715 }, { "epoch": 0.935064935064935, "grad_norm": 1.8934217691421509, "learning_rate": 6.626229675314746e-06, "loss": 1.0242, "step": 18720 }, { "epoch": 0.9353146853146853, "grad_norm": 1.8760234117507935, "learning_rate": 6.600744176563535e-06, "loss": 0.9714, "step": 18725 }, { "epoch": 0.9355644355644356, "grad_norm": 1.8503422737121582, "learning_rate": 6.575258677812325e-06, "loss": 1.1164, "step": 18730 }, { "epoch": 0.9358141858141859, "grad_norm": 2.6395320892333984, "learning_rate": 6.549773179061114e-06, "loss": 1.0316, "step": 18735 }, { "epoch": 0.936063936063936, "grad_norm": 1.7112480401992798, "learning_rate": 6.524287680309904e-06, "loss": 0.92, "step": 18740 }, { "epoch": 0.9363136863136863, "grad_norm": 1.9978595972061157, "learning_rate": 6.4988021815586925e-06, "loss": 0.9617, "step": 18745 }, { "epoch": 0.9365634365634365, "grad_norm": 1.664827585220337, "learning_rate": 6.473316682807483e-06, "loss": 1.0528, "step": 18750 }, { "epoch": 0.9368131868131868, "grad_norm": 1.8503992557525635, "learning_rate": 6.447831184056272e-06, "loss": 0.9819, "step": 18755 }, { "epoch": 0.9370629370629371, "grad_norm": 2.2696139812469482, "learning_rate": 6.422345685305061e-06, "loss": 1.0408, "step": 18760 }, { "epoch": 0.9373126873126874, "grad_norm": 1.7503920793533325, "learning_rate": 6.39686018655385e-06, "loss": 1.0572, "step": 18765 }, { "epoch": 0.9375624375624375, "grad_norm": 2.784553289413452, "learning_rate": 6.371374687802641e-06, "loss": 0.9212, "step": 18770 }, { "epoch": 0.9378121878121878, "grad_norm": 2.096891403198242, "learning_rate": 6.3458891890514295e-06, "loss": 1.0127, "step": 18775 }, { "epoch": 0.938061938061938, "grad_norm": 2.3041465282440186, "learning_rate": 6.320403690300219e-06, "loss": 1.0585, "step": 18780 }, { "epoch": 0.9383116883116883, "grad_norm": 1.8708738088607788, "learning_rate": 6.294918191549008e-06, "loss": 0.9286, "step": 18785 }, { "epoch": 0.9385614385614386, "grad_norm": 1.8542126417160034, "learning_rate": 6.269432692797798e-06, "loss": 0.981, "step": 18790 }, { "epoch": 0.9388111888111889, "grad_norm": 1.8418537378311157, "learning_rate": 6.243947194046588e-06, "loss": 0.9547, "step": 18795 }, { "epoch": 0.939060939060939, "grad_norm": 2.425016164779663, "learning_rate": 6.218461695295378e-06, "loss": 1.0327, "step": 18800 }, { "epoch": 0.9393106893106893, "grad_norm": 1.7475553750991821, "learning_rate": 6.1929761965441665e-06, "loss": 0.9891, "step": 18805 }, { "epoch": 0.9395604395604396, "grad_norm": 1.732914686203003, "learning_rate": 6.167490697792956e-06, "loss": 1.0701, "step": 18810 }, { "epoch": 0.9398101898101898, "grad_norm": 2.1540682315826416, "learning_rate": 6.142005199041746e-06, "loss": 1.0517, "step": 18815 }, { "epoch": 0.9400599400599401, "grad_norm": 1.8808311223983765, "learning_rate": 6.116519700290535e-06, "loss": 1.0286, "step": 18820 }, { "epoch": 0.9403096903096904, "grad_norm": 1.845806360244751, "learning_rate": 6.091034201539324e-06, "loss": 1.0776, "step": 18825 }, { "epoch": 0.9405594405594405, "grad_norm": 1.7584632635116577, "learning_rate": 6.065548702788114e-06, "loss": 1.0718, "step": 18830 }, { "epoch": 0.9408091908091908, "grad_norm": 1.6171338558197021, "learning_rate": 6.0400632040369035e-06, "loss": 1.0544, "step": 18835 }, { "epoch": 0.9410589410589411, "grad_norm": 1.9683438539505005, "learning_rate": 6.014577705285693e-06, "loss": 0.9958, "step": 18840 }, { "epoch": 0.9413086913086913, "grad_norm": 2.0883402824401855, "learning_rate": 5.989092206534482e-06, "loss": 0.9482, "step": 18845 }, { "epoch": 0.9415584415584416, "grad_norm": 3.891770124435425, "learning_rate": 5.9636067077832715e-06, "loss": 1.0172, "step": 18850 }, { "epoch": 0.9418081918081919, "grad_norm": 1.7038673162460327, "learning_rate": 5.938121209032061e-06, "loss": 1.0273, "step": 18855 }, { "epoch": 0.942057942057942, "grad_norm": 2.001326560974121, "learning_rate": 5.912635710280851e-06, "loss": 1.0484, "step": 18860 }, { "epoch": 0.9423076923076923, "grad_norm": 1.7845869064331055, "learning_rate": 5.88715021152964e-06, "loss": 1.0692, "step": 18865 }, { "epoch": 0.9425574425574426, "grad_norm": 1.5655783414840698, "learning_rate": 5.861664712778429e-06, "loss": 1.0728, "step": 18870 }, { "epoch": 0.9428071928071928, "grad_norm": 2.6552300453186035, "learning_rate": 5.836179214027219e-06, "loss": 0.9575, "step": 18875 }, { "epoch": 0.9430569430569431, "grad_norm": 2.4825668334960938, "learning_rate": 5.8106937152760085e-06, "loss": 1.0182, "step": 18880 }, { "epoch": 0.9433066933066933, "grad_norm": 2.1464991569519043, "learning_rate": 5.785208216524797e-06, "loss": 0.9713, "step": 18885 }, { "epoch": 0.9435564435564435, "grad_norm": 2.0101311206817627, "learning_rate": 5.759722717773587e-06, "loss": 1.0376, "step": 18890 }, { "epoch": 0.9438061938061938, "grad_norm": 1.6791499853134155, "learning_rate": 5.734237219022377e-06, "loss": 0.9422, "step": 18895 }, { "epoch": 0.9440559440559441, "grad_norm": 2.133033037185669, "learning_rate": 5.708751720271166e-06, "loss": 0.9836, "step": 18900 }, { "epoch": 0.9443056943056943, "grad_norm": 2.10542368888855, "learning_rate": 5.683266221519955e-06, "loss": 1.013, "step": 18905 }, { "epoch": 0.9445554445554446, "grad_norm": 1.959383249282837, "learning_rate": 5.657780722768745e-06, "loss": 0.9842, "step": 18910 }, { "epoch": 0.9448051948051948, "grad_norm": 2.042891025543213, "learning_rate": 5.632295224017534e-06, "loss": 1.0773, "step": 18915 }, { "epoch": 0.945054945054945, "grad_norm": 1.890740990638733, "learning_rate": 5.606809725266324e-06, "loss": 0.9839, "step": 18920 }, { "epoch": 0.9453046953046953, "grad_norm": 1.5152758359909058, "learning_rate": 5.581324226515114e-06, "loss": 1.0238, "step": 18925 }, { "epoch": 0.9455544455544456, "grad_norm": 1.7638087272644043, "learning_rate": 5.555838727763902e-06, "loss": 1.0248, "step": 18930 }, { "epoch": 0.9458041958041958, "grad_norm": 1.8568116426467896, "learning_rate": 5.530353229012692e-06, "loss": 0.9482, "step": 18935 }, { "epoch": 0.9460539460539461, "grad_norm": 2.0366969108581543, "learning_rate": 5.504867730261482e-06, "loss": 1.0646, "step": 18940 }, { "epoch": 0.9463036963036963, "grad_norm": 1.6727945804595947, "learning_rate": 5.479382231510271e-06, "loss": 1.1042, "step": 18945 }, { "epoch": 0.9465534465534465, "grad_norm": 1.7207212448120117, "learning_rate": 5.45389673275906e-06, "loss": 1.0228, "step": 18950 }, { "epoch": 0.9468031968031968, "grad_norm": 1.8132790327072144, "learning_rate": 5.42841123400785e-06, "loss": 0.894, "step": 18955 }, { "epoch": 0.9470529470529471, "grad_norm": 1.975264072418213, "learning_rate": 5.402925735256639e-06, "loss": 1.0415, "step": 18960 }, { "epoch": 0.9473026973026973, "grad_norm": 2.8204216957092285, "learning_rate": 5.377440236505429e-06, "loss": 1.0326, "step": 18965 }, { "epoch": 0.9475524475524476, "grad_norm": 1.6041123867034912, "learning_rate": 5.351954737754218e-06, "loss": 1.0521, "step": 18970 }, { "epoch": 0.9478021978021978, "grad_norm": 2.1864986419677734, "learning_rate": 5.3264692390030075e-06, "loss": 1.0045, "step": 18975 }, { "epoch": 0.948051948051948, "grad_norm": 1.5278987884521484, "learning_rate": 5.300983740251797e-06, "loss": 1.0822, "step": 18980 }, { "epoch": 0.9483016983016983, "grad_norm": 1.564110517501831, "learning_rate": 5.275498241500587e-06, "loss": 1.0502, "step": 18985 }, { "epoch": 0.9485514485514486, "grad_norm": 1.7374778985977173, "learning_rate": 5.2500127427493755e-06, "loss": 0.9549, "step": 18990 }, { "epoch": 0.9488011988011988, "grad_norm": 1.6353267431259155, "learning_rate": 5.224527243998165e-06, "loss": 1.0015, "step": 18995 }, { "epoch": 0.949050949050949, "grad_norm": 1.9183955192565918, "learning_rate": 5.199041745246955e-06, "loss": 0.9302, "step": 19000 }, { "epoch": 0.9493006993006993, "grad_norm": 2.2196826934814453, "learning_rate": 5.1735562464957445e-06, "loss": 0.951, "step": 19005 }, { "epoch": 0.9495504495504495, "grad_norm": 2.0099048614501953, "learning_rate": 5.148070747744533e-06, "loss": 0.9074, "step": 19010 }, { "epoch": 0.9498001998001998, "grad_norm": 1.4550764560699463, "learning_rate": 5.122585248993323e-06, "loss": 0.993, "step": 19015 }, { "epoch": 0.9500499500499501, "grad_norm": 1.968523621559143, "learning_rate": 5.0970997502421125e-06, "loss": 0.9936, "step": 19020 }, { "epoch": 0.9502997002997003, "grad_norm": 1.8548469543457031, "learning_rate": 5.071614251490902e-06, "loss": 0.9909, "step": 19025 }, { "epoch": 0.9505494505494505, "grad_norm": 1.5858134031295776, "learning_rate": 5.046128752739691e-06, "loss": 1.0222, "step": 19030 }, { "epoch": 0.9507992007992008, "grad_norm": 1.9728139638900757, "learning_rate": 5.020643253988481e-06, "loss": 0.9491, "step": 19035 }, { "epoch": 0.951048951048951, "grad_norm": 2.830547571182251, "learning_rate": 4.99515775523727e-06, "loss": 0.9561, "step": 19040 }, { "epoch": 0.9512987012987013, "grad_norm": 2.060822010040283, "learning_rate": 4.96967225648606e-06, "loss": 1.0168, "step": 19045 }, { "epoch": 0.9515484515484516, "grad_norm": 1.6886918544769287, "learning_rate": 4.9441867577348495e-06, "loss": 1.0037, "step": 19050 }, { "epoch": 0.9517982017982018, "grad_norm": 2.0498859882354736, "learning_rate": 4.918701258983638e-06, "loss": 0.9687, "step": 19055 }, { "epoch": 0.952047952047952, "grad_norm": 1.6786259412765503, "learning_rate": 4.893215760232428e-06, "loss": 1.0367, "step": 19060 }, { "epoch": 0.9522977022977023, "grad_norm": 1.7148199081420898, "learning_rate": 4.867730261481218e-06, "loss": 1.0118, "step": 19065 }, { "epoch": 0.9525474525474525, "grad_norm": 1.735885500907898, "learning_rate": 4.842244762730007e-06, "loss": 0.9448, "step": 19070 }, { "epoch": 0.9527972027972028, "grad_norm": 2.0784859657287598, "learning_rate": 4.816759263978796e-06, "loss": 1.0241, "step": 19075 }, { "epoch": 0.9530469530469531, "grad_norm": 1.9451489448547363, "learning_rate": 4.791273765227586e-06, "loss": 1.0541, "step": 19080 }, { "epoch": 0.9532967032967034, "grad_norm": 1.6191470623016357, "learning_rate": 4.765788266476375e-06, "loss": 0.9712, "step": 19085 }, { "epoch": 0.9535464535464535, "grad_norm": 2.2786192893981934, "learning_rate": 4.740302767725165e-06, "loss": 0.9503, "step": 19090 }, { "epoch": 0.9537962037962038, "grad_norm": 1.9751181602478027, "learning_rate": 4.714817268973954e-06, "loss": 1.0473, "step": 19095 }, { "epoch": 0.954045954045954, "grad_norm": 2.223865032196045, "learning_rate": 4.689331770222743e-06, "loss": 0.9854, "step": 19100 }, { "epoch": 0.9542957042957043, "grad_norm": 1.9516491889953613, "learning_rate": 4.663846271471533e-06, "loss": 0.9702, "step": 19105 }, { "epoch": 0.9545454545454546, "grad_norm": 1.914487600326538, "learning_rate": 4.638360772720323e-06, "loss": 0.9328, "step": 19110 }, { "epoch": 0.9547952047952047, "grad_norm": 3.862971782684326, "learning_rate": 4.6128752739691115e-06, "loss": 1.1179, "step": 19115 }, { "epoch": 0.955044955044955, "grad_norm": 1.7894824743270874, "learning_rate": 4.587389775217901e-06, "loss": 1.0057, "step": 19120 }, { "epoch": 0.9552947052947053, "grad_norm": 4.797071933746338, "learning_rate": 4.561904276466691e-06, "loss": 0.9011, "step": 19125 }, { "epoch": 0.9555444555444556, "grad_norm": 1.9644567966461182, "learning_rate": 4.53641877771548e-06, "loss": 0.9474, "step": 19130 }, { "epoch": 0.9557942057942058, "grad_norm": 1.753190517425537, "learning_rate": 4.510933278964269e-06, "loss": 1.0439, "step": 19135 }, { "epoch": 0.9560439560439561, "grad_norm": 1.8479493856430054, "learning_rate": 4.485447780213059e-06, "loss": 1.0808, "step": 19140 }, { "epoch": 0.9562937062937062, "grad_norm": 1.8204654455184937, "learning_rate": 4.4599622814618485e-06, "loss": 1.0299, "step": 19145 }, { "epoch": 0.9565434565434565, "grad_norm": 1.87382173538208, "learning_rate": 4.434476782710638e-06, "loss": 1.1275, "step": 19150 }, { "epoch": 0.9567932067932068, "grad_norm": 1.571408748626709, "learning_rate": 4.408991283959427e-06, "loss": 1.1283, "step": 19155 }, { "epoch": 0.957042957042957, "grad_norm": 1.8600305318832397, "learning_rate": 4.3835057852082165e-06, "loss": 0.9607, "step": 19160 }, { "epoch": 0.9572927072927073, "grad_norm": 2.2404284477233887, "learning_rate": 4.358020286457006e-06, "loss": 1.0579, "step": 19165 }, { "epoch": 0.9575424575424576, "grad_norm": 1.392696738243103, "learning_rate": 4.332534787705796e-06, "loss": 1.0218, "step": 19170 }, { "epoch": 0.9577922077922078, "grad_norm": 2.026289701461792, "learning_rate": 4.3070492889545855e-06, "loss": 1.0753, "step": 19175 }, { "epoch": 0.958041958041958, "grad_norm": 1.5225684642791748, "learning_rate": 4.281563790203374e-06, "loss": 0.997, "step": 19180 }, { "epoch": 0.9582917082917083, "grad_norm": 1.5099003314971924, "learning_rate": 4.256078291452164e-06, "loss": 1.0699, "step": 19185 }, { "epoch": 0.9585414585414586, "grad_norm": 2.4741945266723633, "learning_rate": 4.2305927927009535e-06, "loss": 0.9817, "step": 19190 }, { "epoch": 0.9587912087912088, "grad_norm": 2.0272533893585205, "learning_rate": 4.205107293949743e-06, "loss": 0.9776, "step": 19195 }, { "epoch": 0.9590409590409591, "grad_norm": 2.015946865081787, "learning_rate": 4.179621795198532e-06, "loss": 1.0113, "step": 19200 }, { "epoch": 0.9592907092907093, "grad_norm": 1.6356379985809326, "learning_rate": 4.154136296447322e-06, "loss": 1.0134, "step": 19205 }, { "epoch": 0.9595404595404595, "grad_norm": 2.348116397857666, "learning_rate": 4.128650797696111e-06, "loss": 0.9949, "step": 19210 }, { "epoch": 0.9597902097902098, "grad_norm": 2.025428533554077, "learning_rate": 4.103165298944901e-06, "loss": 0.9737, "step": 19215 }, { "epoch": 0.9600399600399601, "grad_norm": 1.6804481744766235, "learning_rate": 4.07767980019369e-06, "loss": 1.0464, "step": 19220 }, { "epoch": 0.9602897102897103, "grad_norm": 1.7540175914764404, "learning_rate": 4.052194301442479e-06, "loss": 1.0259, "step": 19225 }, { "epoch": 0.9605394605394605, "grad_norm": 1.8846664428710938, "learning_rate": 4.026708802691269e-06, "loss": 1.0225, "step": 19230 }, { "epoch": 0.9607892107892108, "grad_norm": 1.8593484163284302, "learning_rate": 4.001223303940059e-06, "loss": 1.0886, "step": 19235 }, { "epoch": 0.961038961038961, "grad_norm": 2.807832956314087, "learning_rate": 3.975737805188847e-06, "loss": 0.9271, "step": 19240 }, { "epoch": 0.9612887112887113, "grad_norm": 1.5321229696273804, "learning_rate": 3.950252306437637e-06, "loss": 0.9443, "step": 19245 }, { "epoch": 0.9615384615384616, "grad_norm": 1.6992570161819458, "learning_rate": 3.924766807686427e-06, "loss": 0.9766, "step": 19250 }, { "epoch": 0.9617882117882118, "grad_norm": 2.173574209213257, "learning_rate": 3.899281308935216e-06, "loss": 1.0146, "step": 19255 }, { "epoch": 0.962037962037962, "grad_norm": 2.8477325439453125, "learning_rate": 3.873795810184005e-06, "loss": 1.0951, "step": 19260 }, { "epoch": 0.9622877122877123, "grad_norm": 1.6449047327041626, "learning_rate": 3.848310311432795e-06, "loss": 0.979, "step": 19265 }, { "epoch": 0.9625374625374625, "grad_norm": 1.9028947353363037, "learning_rate": 3.822824812681584e-06, "loss": 0.94, "step": 19270 }, { "epoch": 0.9627872127872128, "grad_norm": 1.7588454484939575, "learning_rate": 3.7973393139303736e-06, "loss": 0.9578, "step": 19275 }, { "epoch": 0.9630369630369631, "grad_norm": 1.7416634559631348, "learning_rate": 3.7718538151791633e-06, "loss": 1.0162, "step": 19280 }, { "epoch": 0.9632867132867133, "grad_norm": 2.139782667160034, "learning_rate": 3.746368316427953e-06, "loss": 1.0925, "step": 19285 }, { "epoch": 0.9635364635364635, "grad_norm": 1.8083499670028687, "learning_rate": 3.720882817676742e-06, "loss": 1.0104, "step": 19290 }, { "epoch": 0.9637862137862138, "grad_norm": 1.5339778661727905, "learning_rate": 3.6953973189255318e-06, "loss": 0.932, "step": 19295 }, { "epoch": 0.964035964035964, "grad_norm": 2.5419790744781494, "learning_rate": 3.669911820174321e-06, "loss": 1.0053, "step": 19300 }, { "epoch": 0.9642857142857143, "grad_norm": 1.6162371635437012, "learning_rate": 3.6444263214231106e-06, "loss": 1.0453, "step": 19305 }, { "epoch": 0.9645354645354646, "grad_norm": 1.50950026512146, "learning_rate": 3.6189408226719e-06, "loss": 1.0122, "step": 19310 }, { "epoch": 0.9647852147852148, "grad_norm": 1.8654299974441528, "learning_rate": 3.5934553239206895e-06, "loss": 0.932, "step": 19315 }, { "epoch": 0.965034965034965, "grad_norm": 2.1115314960479736, "learning_rate": 3.5679698251694787e-06, "loss": 1.0025, "step": 19320 }, { "epoch": 0.9652847152847153, "grad_norm": 2.2970571517944336, "learning_rate": 3.5424843264182683e-06, "loss": 0.9772, "step": 19325 }, { "epoch": 0.9655344655344655, "grad_norm": 1.549383282661438, "learning_rate": 3.5169988276670575e-06, "loss": 1.0037, "step": 19330 }, { "epoch": 0.9657842157842158, "grad_norm": 1.8178656101226807, "learning_rate": 3.491513328915847e-06, "loss": 1.0179, "step": 19335 }, { "epoch": 0.9660339660339661, "grad_norm": 3.291788339614868, "learning_rate": 3.4660278301646364e-06, "loss": 0.9806, "step": 19340 }, { "epoch": 0.9662837162837162, "grad_norm": 1.6074204444885254, "learning_rate": 3.440542331413426e-06, "loss": 0.9041, "step": 19345 }, { "epoch": 0.9665334665334665, "grad_norm": 1.5580962896347046, "learning_rate": 3.4150568326622153e-06, "loss": 1.0611, "step": 19350 }, { "epoch": 0.9667832167832168, "grad_norm": 2.241830587387085, "learning_rate": 3.389571333911005e-06, "loss": 1.0525, "step": 19355 }, { "epoch": 0.967032967032967, "grad_norm": 1.7117940187454224, "learning_rate": 3.364085835159794e-06, "loss": 1.0615, "step": 19360 }, { "epoch": 0.9672827172827173, "grad_norm": 2.1533820629119873, "learning_rate": 3.3386003364085838e-06, "loss": 1.0157, "step": 19365 }, { "epoch": 0.9675324675324676, "grad_norm": 1.8521478176116943, "learning_rate": 3.313114837657373e-06, "loss": 1.0505, "step": 19370 }, { "epoch": 0.9677822177822177, "grad_norm": 1.9734584093093872, "learning_rate": 3.2876293389061626e-06, "loss": 1.0303, "step": 19375 }, { "epoch": 0.968031968031968, "grad_norm": 2.9837841987609863, "learning_rate": 3.262143840154952e-06, "loss": 1.0012, "step": 19380 }, { "epoch": 0.9682817182817183, "grad_norm": 1.842237949371338, "learning_rate": 3.2366583414037415e-06, "loss": 0.9258, "step": 19385 }, { "epoch": 0.9685314685314685, "grad_norm": 1.6294020414352417, "learning_rate": 3.2111728426525307e-06, "loss": 1.0086, "step": 19390 }, { "epoch": 0.9687812187812188, "grad_norm": 1.9204133749008179, "learning_rate": 3.1856873439013203e-06, "loss": 1.0753, "step": 19395 }, { "epoch": 0.9690309690309691, "grad_norm": 1.7796682119369507, "learning_rate": 3.1602018451501095e-06, "loss": 1.0146, "step": 19400 }, { "epoch": 0.9692807192807192, "grad_norm": 1.721350073814392, "learning_rate": 3.134716346398899e-06, "loss": 1.0511, "step": 19405 }, { "epoch": 0.9695304695304695, "grad_norm": 2.4217476844787598, "learning_rate": 3.109230847647689e-06, "loss": 0.9495, "step": 19410 }, { "epoch": 0.9697802197802198, "grad_norm": 1.6207143068313599, "learning_rate": 3.083745348896478e-06, "loss": 0.9644, "step": 19415 }, { "epoch": 0.97002997002997, "grad_norm": 1.8446784019470215, "learning_rate": 3.0582598501452677e-06, "loss": 1.0213, "step": 19420 }, { "epoch": 0.9702797202797203, "grad_norm": 1.898337483406067, "learning_rate": 3.032774351394057e-06, "loss": 1.0266, "step": 19425 }, { "epoch": 0.9705294705294706, "grad_norm": 1.6785715818405151, "learning_rate": 3.0072888526428465e-06, "loss": 1.0486, "step": 19430 }, { "epoch": 0.9707792207792207, "grad_norm": 1.6511374711990356, "learning_rate": 2.9818033538916358e-06, "loss": 1.1185, "step": 19435 }, { "epoch": 0.971028971028971, "grad_norm": 2.3202567100524902, "learning_rate": 2.9563178551404254e-06, "loss": 1.168, "step": 19440 }, { "epoch": 0.9712787212787213, "grad_norm": 2.948145627975464, "learning_rate": 2.9308323563892146e-06, "loss": 1.1017, "step": 19445 }, { "epoch": 0.9715284715284715, "grad_norm": 2.4074318408966064, "learning_rate": 2.9053468576380043e-06, "loss": 1.0513, "step": 19450 }, { "epoch": 0.9717782217782218, "grad_norm": 2.246199131011963, "learning_rate": 2.8798613588867935e-06, "loss": 1.1424, "step": 19455 }, { "epoch": 0.972027972027972, "grad_norm": 1.8494129180908203, "learning_rate": 2.854375860135583e-06, "loss": 1.0119, "step": 19460 }, { "epoch": 0.9722777222777222, "grad_norm": 1.5825841426849365, "learning_rate": 2.8288903613843723e-06, "loss": 1.0674, "step": 19465 }, { "epoch": 0.9725274725274725, "grad_norm": 1.972778558731079, "learning_rate": 2.803404862633162e-06, "loss": 0.9271, "step": 19470 }, { "epoch": 0.9727772227772228, "grad_norm": 1.9893372058868408, "learning_rate": 2.777919363881951e-06, "loss": 0.9069, "step": 19475 }, { "epoch": 0.973026973026973, "grad_norm": 1.9189273118972778, "learning_rate": 2.752433865130741e-06, "loss": 0.9133, "step": 19480 }, { "epoch": 0.9732767232767233, "grad_norm": 2.1763763427734375, "learning_rate": 2.72694836637953e-06, "loss": 0.9886, "step": 19485 }, { "epoch": 0.9735264735264735, "grad_norm": 2.1150147914886475, "learning_rate": 2.7014628676283197e-06, "loss": 1.0639, "step": 19490 }, { "epoch": 0.9737762237762237, "grad_norm": 2.3204472064971924, "learning_rate": 2.675977368877109e-06, "loss": 1.1072, "step": 19495 }, { "epoch": 0.974025974025974, "grad_norm": 1.8466722965240479, "learning_rate": 2.6504918701258985e-06, "loss": 1.048, "step": 19500 }, { "epoch": 0.9742757242757243, "grad_norm": 1.6079705953598022, "learning_rate": 2.6250063713746878e-06, "loss": 0.8329, "step": 19505 }, { "epoch": 0.9745254745254746, "grad_norm": 1.5413941144943237, "learning_rate": 2.5995208726234774e-06, "loss": 0.998, "step": 19510 }, { "epoch": 0.9747752247752248, "grad_norm": 2.1617441177368164, "learning_rate": 2.5740353738722666e-06, "loss": 0.9655, "step": 19515 }, { "epoch": 0.975024975024975, "grad_norm": 2.5063273906707764, "learning_rate": 2.5485498751210563e-06, "loss": 1.0877, "step": 19520 }, { "epoch": 0.9752747252747253, "grad_norm": 2.1627862453460693, "learning_rate": 2.5230643763698455e-06, "loss": 0.9962, "step": 19525 }, { "epoch": 0.9755244755244755, "grad_norm": 1.7253419160842896, "learning_rate": 2.497578877618635e-06, "loss": 1.0373, "step": 19530 }, { "epoch": 0.9757742257742258, "grad_norm": 2.0045435428619385, "learning_rate": 2.4720933788674248e-06, "loss": 0.9769, "step": 19535 }, { "epoch": 0.9760239760239761, "grad_norm": 2.3594048023223877, "learning_rate": 2.446607880116214e-06, "loss": 1.0796, "step": 19540 }, { "epoch": 0.9762737262737263, "grad_norm": 1.5950548648834229, "learning_rate": 2.4211223813650036e-06, "loss": 0.9571, "step": 19545 }, { "epoch": 0.9765234765234765, "grad_norm": 1.6765066385269165, "learning_rate": 2.395636882613793e-06, "loss": 0.9187, "step": 19550 }, { "epoch": 0.9767732267732268, "grad_norm": 2.0699381828308105, "learning_rate": 2.3701513838625825e-06, "loss": 1.0075, "step": 19555 }, { "epoch": 0.977022977022977, "grad_norm": 1.8824483156204224, "learning_rate": 2.3446658851113717e-06, "loss": 1.0175, "step": 19560 }, { "epoch": 0.9772727272727273, "grad_norm": 1.660109043121338, "learning_rate": 2.3191803863601613e-06, "loss": 0.9547, "step": 19565 }, { "epoch": 0.9775224775224776, "grad_norm": 2.276287794113159, "learning_rate": 2.2936948876089506e-06, "loss": 1.0865, "step": 19570 }, { "epoch": 0.9777722277722277, "grad_norm": 2.5897884368896484, "learning_rate": 2.26820938885774e-06, "loss": 0.9957, "step": 19575 }, { "epoch": 0.978021978021978, "grad_norm": 1.6370633840560913, "learning_rate": 2.2427238901065294e-06, "loss": 0.9744, "step": 19580 }, { "epoch": 0.9782717282717283, "grad_norm": 2.071108341217041, "learning_rate": 2.217238391355319e-06, "loss": 1.1419, "step": 19585 }, { "epoch": 0.9785214785214785, "grad_norm": 1.9755899906158447, "learning_rate": 2.1917528926041083e-06, "loss": 1.0662, "step": 19590 }, { "epoch": 0.9787712287712288, "grad_norm": 2.042848587036133, "learning_rate": 2.166267393852898e-06, "loss": 1.0122, "step": 19595 }, { "epoch": 0.9790209790209791, "grad_norm": 1.708868384361267, "learning_rate": 2.140781895101687e-06, "loss": 0.9722, "step": 19600 }, { "epoch": 0.9792707292707292, "grad_norm": 2.1592464447021484, "learning_rate": 2.1152963963504768e-06, "loss": 1.0515, "step": 19605 }, { "epoch": 0.9795204795204795, "grad_norm": 1.9306093454360962, "learning_rate": 2.089810897599266e-06, "loss": 1.016, "step": 19610 }, { "epoch": 0.9797702297702298, "grad_norm": 1.5760836601257324, "learning_rate": 2.0643253988480556e-06, "loss": 1.0194, "step": 19615 }, { "epoch": 0.98001998001998, "grad_norm": 2.2680790424346924, "learning_rate": 2.038839900096845e-06, "loss": 1.0216, "step": 19620 }, { "epoch": 0.9802697302697303, "grad_norm": 1.909187912940979, "learning_rate": 2.0133544013456345e-06, "loss": 1.016, "step": 19625 }, { "epoch": 0.9805194805194806, "grad_norm": 2.057035207748413, "learning_rate": 1.9878689025944237e-06, "loss": 1.0895, "step": 19630 }, { "epoch": 0.9807692307692307, "grad_norm": 1.8652408123016357, "learning_rate": 1.9623834038432133e-06, "loss": 0.9584, "step": 19635 }, { "epoch": 0.981018981018981, "grad_norm": 1.8059011697769165, "learning_rate": 1.9368979050920026e-06, "loss": 1.1051, "step": 19640 }, { "epoch": 0.9812687312687313, "grad_norm": 2.1325511932373047, "learning_rate": 1.911412406340792e-06, "loss": 1.0637, "step": 19645 }, { "epoch": 0.9815184815184815, "grad_norm": 1.8671185970306396, "learning_rate": 1.8859269075895816e-06, "loss": 1.0288, "step": 19650 }, { "epoch": 0.9817682317682318, "grad_norm": 1.7622582912445068, "learning_rate": 1.860441408838371e-06, "loss": 1.0116, "step": 19655 }, { "epoch": 0.9820179820179821, "grad_norm": 1.8095593452453613, "learning_rate": 1.8349559100871605e-06, "loss": 0.9925, "step": 19660 }, { "epoch": 0.9822677322677322, "grad_norm": 1.7579630613327026, "learning_rate": 1.80947041133595e-06, "loss": 0.9627, "step": 19665 }, { "epoch": 0.9825174825174825, "grad_norm": 2.092794179916382, "learning_rate": 1.7839849125847393e-06, "loss": 0.9571, "step": 19670 }, { "epoch": 0.9827672327672328, "grad_norm": 1.6793092489242554, "learning_rate": 1.7584994138335288e-06, "loss": 0.9186, "step": 19675 }, { "epoch": 0.983016983016983, "grad_norm": 1.5173344612121582, "learning_rate": 1.7330139150823182e-06, "loss": 1.0003, "step": 19680 }, { "epoch": 0.9832667332667333, "grad_norm": 2.119503974914551, "learning_rate": 1.7075284163311076e-06, "loss": 1.1301, "step": 19685 }, { "epoch": 0.9835164835164835, "grad_norm": 2.237276077270508, "learning_rate": 1.682042917579897e-06, "loss": 1.007, "step": 19690 }, { "epoch": 0.9837662337662337, "grad_norm": 1.997570514678955, "learning_rate": 1.6565574188286865e-06, "loss": 1.037, "step": 19695 }, { "epoch": 0.984015984015984, "grad_norm": 2.447059154510498, "learning_rate": 1.631071920077476e-06, "loss": 1.0176, "step": 19700 }, { "epoch": 0.9842657342657343, "grad_norm": 1.9173567295074463, "learning_rate": 1.6055864213262653e-06, "loss": 1.0308, "step": 19705 }, { "epoch": 0.9845154845154845, "grad_norm": 1.9106783866882324, "learning_rate": 1.5801009225750548e-06, "loss": 0.836, "step": 19710 }, { "epoch": 0.9847652347652348, "grad_norm": 2.590735673904419, "learning_rate": 1.5546154238238444e-06, "loss": 1.0188, "step": 19715 }, { "epoch": 0.985014985014985, "grad_norm": 2.1768710613250732, "learning_rate": 1.5291299250726338e-06, "loss": 1.0612, "step": 19720 }, { "epoch": 0.9852647352647352, "grad_norm": 1.8687500953674316, "learning_rate": 1.5036444263214233e-06, "loss": 1.0582, "step": 19725 }, { "epoch": 0.9855144855144855, "grad_norm": 1.8283836841583252, "learning_rate": 1.4781589275702127e-06, "loss": 0.9479, "step": 19730 }, { "epoch": 0.9857642357642358, "grad_norm": 1.9325308799743652, "learning_rate": 1.4526734288190021e-06, "loss": 0.9691, "step": 19735 }, { "epoch": 0.986013986013986, "grad_norm": 1.6064478158950806, "learning_rate": 1.4271879300677916e-06, "loss": 1.0117, "step": 19740 }, { "epoch": 0.9862637362637363, "grad_norm": 1.4492825269699097, "learning_rate": 1.401702431316581e-06, "loss": 0.979, "step": 19745 }, { "epoch": 0.9865134865134865, "grad_norm": 2.0389037132263184, "learning_rate": 1.3762169325653704e-06, "loss": 0.9758, "step": 19750 }, { "epoch": 0.9867632367632367, "grad_norm": 2.0293426513671875, "learning_rate": 1.3507314338141598e-06, "loss": 0.9235, "step": 19755 }, { "epoch": 0.987012987012987, "grad_norm": 2.00239634513855, "learning_rate": 1.3252459350629493e-06, "loss": 1.0774, "step": 19760 }, { "epoch": 0.9872627372627373, "grad_norm": 1.9364699125289917, "learning_rate": 1.2997604363117387e-06, "loss": 1.1161, "step": 19765 }, { "epoch": 0.9875124875124875, "grad_norm": 1.907851219177246, "learning_rate": 1.2742749375605281e-06, "loss": 1.2067, "step": 19770 }, { "epoch": 0.9877622377622378, "grad_norm": 2.0145368576049805, "learning_rate": 1.2487894388093176e-06, "loss": 0.893, "step": 19775 }, { "epoch": 0.988011988011988, "grad_norm": 1.6930556297302246, "learning_rate": 1.223303940058107e-06, "loss": 0.9626, "step": 19780 }, { "epoch": 0.9882617382617382, "grad_norm": 2.4282970428466797, "learning_rate": 1.1978184413068964e-06, "loss": 1.049, "step": 19785 }, { "epoch": 0.9885114885114885, "grad_norm": 2.2468302249908447, "learning_rate": 1.1723329425556858e-06, "loss": 0.9447, "step": 19790 }, { "epoch": 0.9887612387612388, "grad_norm": 1.9117259979248047, "learning_rate": 1.1468474438044753e-06, "loss": 0.9514, "step": 19795 }, { "epoch": 0.989010989010989, "grad_norm": 2.2472829818725586, "learning_rate": 1.1213619450532647e-06, "loss": 0.9276, "step": 19800 }, { "epoch": 0.9892607392607392, "grad_norm": 1.716097354888916, "learning_rate": 1.0958764463020541e-06, "loss": 1.0453, "step": 19805 }, { "epoch": 0.9895104895104895, "grad_norm": 1.8103221654891968, "learning_rate": 1.0703909475508436e-06, "loss": 0.9873, "step": 19810 }, { "epoch": 0.9897602397602397, "grad_norm": 1.9715547561645508, "learning_rate": 1.044905448799633e-06, "loss": 0.9727, "step": 19815 }, { "epoch": 0.99000999000999, "grad_norm": 1.8063431978225708, "learning_rate": 1.0194199500484224e-06, "loss": 0.9818, "step": 19820 }, { "epoch": 0.9902597402597403, "grad_norm": 3.6261048316955566, "learning_rate": 9.939344512972119e-07, "loss": 1.0226, "step": 19825 }, { "epoch": 0.9905094905094906, "grad_norm": 2.428201198577881, "learning_rate": 9.684489525460013e-07, "loss": 0.9959, "step": 19830 }, { "epoch": 0.9907592407592407, "grad_norm": 2.520509958267212, "learning_rate": 9.429634537947908e-07, "loss": 1.0499, "step": 19835 }, { "epoch": 0.991008991008991, "grad_norm": 1.8354647159576416, "learning_rate": 9.174779550435802e-07, "loss": 1.0288, "step": 19840 }, { "epoch": 0.9912587412587412, "grad_norm": 1.9098531007766724, "learning_rate": 8.919924562923697e-07, "loss": 0.998, "step": 19845 }, { "epoch": 0.9915084915084915, "grad_norm": 1.84784734249115, "learning_rate": 8.665069575411591e-07, "loss": 0.9246, "step": 19850 }, { "epoch": 0.9917582417582418, "grad_norm": 1.998331904411316, "learning_rate": 8.410214587899485e-07, "loss": 0.9341, "step": 19855 }, { "epoch": 0.9920079920079921, "grad_norm": 1.8428945541381836, "learning_rate": 8.15535960038738e-07, "loss": 0.9848, "step": 19860 }, { "epoch": 0.9922577422577422, "grad_norm": 1.6971213817596436, "learning_rate": 7.900504612875274e-07, "loss": 1.0297, "step": 19865 }, { "epoch": 0.9925074925074925, "grad_norm": 1.5451939105987549, "learning_rate": 7.645649625363169e-07, "loss": 1.022, "step": 19870 }, { "epoch": 0.9927572427572428, "grad_norm": 1.72762930393219, "learning_rate": 7.390794637851064e-07, "loss": 1.0774, "step": 19875 }, { "epoch": 0.993006993006993, "grad_norm": 2.0969457626342773, "learning_rate": 7.135939650338958e-07, "loss": 1.0542, "step": 19880 }, { "epoch": 0.9932567432567433, "grad_norm": 1.6009266376495361, "learning_rate": 6.881084662826852e-07, "loss": 1.0305, "step": 19885 }, { "epoch": 0.9935064935064936, "grad_norm": 1.6609162092208862, "learning_rate": 6.626229675314746e-07, "loss": 1.0139, "step": 19890 }, { "epoch": 0.9937562437562437, "grad_norm": 2.324324131011963, "learning_rate": 6.371374687802641e-07, "loss": 0.9906, "step": 19895 }, { "epoch": 0.994005994005994, "grad_norm": 2.4590704441070557, "learning_rate": 6.116519700290535e-07, "loss": 0.9756, "step": 19900 }, { "epoch": 0.9942557442557443, "grad_norm": 2.641580104827881, "learning_rate": 5.861664712778429e-07, "loss": 0.9, "step": 19905 }, { "epoch": 0.9945054945054945, "grad_norm": 2.2611355781555176, "learning_rate": 5.606809725266324e-07, "loss": 0.9463, "step": 19910 }, { "epoch": 0.9947552447552448, "grad_norm": 2.054806709289551, "learning_rate": 5.351954737754218e-07, "loss": 1.037, "step": 19915 }, { "epoch": 0.995004995004995, "grad_norm": 2.0479061603546143, "learning_rate": 5.097099750242112e-07, "loss": 1.0423, "step": 19920 }, { "epoch": 0.9952547452547452, "grad_norm": 1.893660306930542, "learning_rate": 4.842244762730006e-07, "loss": 0.961, "step": 19925 }, { "epoch": 0.9955044955044955, "grad_norm": 1.8999965190887451, "learning_rate": 4.587389775217901e-07, "loss": 0.9296, "step": 19930 }, { "epoch": 0.9957542457542458, "grad_norm": 2.2480978965759277, "learning_rate": 4.3325347877057955e-07, "loss": 0.9515, "step": 19935 }, { "epoch": 0.996003996003996, "grad_norm": 1.8258984088897705, "learning_rate": 4.07767980019369e-07, "loss": 0.9633, "step": 19940 }, { "epoch": 0.9962537462537463, "grad_norm": 1.7698311805725098, "learning_rate": 3.8228248126815846e-07, "loss": 0.8889, "step": 19945 }, { "epoch": 0.9965034965034965, "grad_norm": 1.6415377855300903, "learning_rate": 3.567969825169479e-07, "loss": 0.9437, "step": 19950 }, { "epoch": 0.9967532467532467, "grad_norm": 2.2971489429473877, "learning_rate": 3.313114837657373e-07, "loss": 1.0407, "step": 19955 }, { "epoch": 0.997002997002997, "grad_norm": 1.8327277898788452, "learning_rate": 3.0582598501452675e-07, "loss": 0.9389, "step": 19960 }, { "epoch": 0.9972527472527473, "grad_norm": 2.320667028427124, "learning_rate": 2.803404862633162e-07, "loss": 0.9924, "step": 19965 }, { "epoch": 0.9975024975024975, "grad_norm": 2.5481178760528564, "learning_rate": 2.548549875121056e-07, "loss": 1.0146, "step": 19970 }, { "epoch": 0.9977522477522478, "grad_norm": 1.8971010446548462, "learning_rate": 2.2936948876089506e-07, "loss": 0.9947, "step": 19975 }, { "epoch": 0.998001998001998, "grad_norm": 2.0232813358306885, "learning_rate": 2.038839900096845e-07, "loss": 1.0058, "step": 19980 }, { "epoch": 0.9982517482517482, "grad_norm": 1.670681357383728, "learning_rate": 1.7839849125847394e-07, "loss": 0.9961, "step": 19985 }, { "epoch": 0.9985014985014985, "grad_norm": 1.688429355621338, "learning_rate": 1.5291299250726337e-07, "loss": 1.021, "step": 19990 }, { "epoch": 0.9987512487512488, "grad_norm": 1.6594581604003906, "learning_rate": 1.274274937560528e-07, "loss": 0.9472, "step": 19995 }, { "epoch": 0.999000999000999, "grad_norm": 1.863791584968567, "learning_rate": 1.0194199500484224e-07, "loss": 1.0432, "step": 20000 }, { "epoch": 0.9992507492507493, "grad_norm": 2.2091052532196045, "learning_rate": 7.645649625363169e-08, "loss": 1.1233, "step": 20005 }, { "epoch": 0.9995004995004995, "grad_norm": 1.881324052810669, "learning_rate": 5.097099750242112e-08, "loss": 1.0248, "step": 20010 }, { "epoch": 0.9997502497502497, "grad_norm": 2.003769636154175, "learning_rate": 2.548549875121056e-08, "loss": 0.9784, "step": 20015 }, { "epoch": 1.0, "grad_norm": 2.0593271255493164, "learning_rate": 0.0, "loss": 1.0128, "step": 20020 }, { "epoch": 1.0, "step": 20020, "total_flos": 3.281440727517836e+19, "train_loss": 1.0351451178768893, "train_runtime": 41238.2253, "train_samples_per_second": 15.534, "train_steps_per_second": 0.485 } ], "logging_steps": 5, "max_steps": 20020, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.281440727517836e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }