{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9973935708079931,
  "eval_steps": 500,
  "global_step": 287,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0034752389226759338,
      "grad_norm": 13.863444328308105,
      "learning_rate": 2.222222222222222e-06,
      "loss": 0.8359,
      "step": 1
    },
    {
      "epoch": 0.0069504778453518675,
      "grad_norm": 12.374430656433105,
      "learning_rate": 4.444444444444444e-06,
      "loss": 0.8862,
      "step": 2
    },
    {
      "epoch": 0.010425716768027803,
      "grad_norm": 11.986786842346191,
      "learning_rate": 6.666666666666667e-06,
      "loss": 0.7899,
      "step": 3
    },
    {
      "epoch": 0.013900955690703735,
      "grad_norm": 6.093581199645996,
      "learning_rate": 8.888888888888888e-06,
      "loss": 0.7342,
      "step": 4
    },
    {
      "epoch": 0.01737619461337967,
      "grad_norm": 2.471663475036621,
      "learning_rate": 1.1111111111111113e-05,
      "loss": 0.6759,
      "step": 5
    },
    {
      "epoch": 0.020851433536055605,
      "grad_norm": 2.6024749279022217,
      "learning_rate": 1.3333333333333333e-05,
      "loss": 0.6608,
      "step": 6
    },
    {
      "epoch": 0.02432667245873154,
      "grad_norm": 2.211730718612671,
      "learning_rate": 1.555555555555556e-05,
      "loss": 0.5929,
      "step": 7
    },
    {
      "epoch": 0.02780191138140747,
      "grad_norm": 2.3514232635498047,
      "learning_rate": 1.7777777777777777e-05,
      "loss": 0.6591,
      "step": 8
    },
    {
      "epoch": 0.03127715030408341,
      "grad_norm": 2.4454445838928223,
      "learning_rate": 2e-05,
      "loss": 0.6129,
      "step": 9
    },
    {
      "epoch": 0.03475238922675934,
      "grad_norm": 3.566322088241577,
      "learning_rate": 1.9999361478484043e-05,
      "loss": 0.6247,
      "step": 10
    },
    {
      "epoch": 0.038227628149435276,
      "grad_norm": 2.7100212574005127,
      "learning_rate": 1.999744599547812e-05,
      "loss": 0.5985,
      "step": 11
    },
    {
      "epoch": 0.04170286707211121,
      "grad_norm": 2.569108247756958,
      "learning_rate": 1.999425379559765e-05,
      "loss": 0.6277,
      "step": 12
    },
    {
      "epoch": 0.045178105994787145,
      "grad_norm": 2.111241102218628,
      "learning_rate": 1.9989785286500294e-05,
      "loss": 0.6268,
      "step": 13
    },
    {
      "epoch": 0.04865334491746308,
      "grad_norm": 2.4817757606506348,
      "learning_rate": 1.99840410388339e-05,
      "loss": 0.6465,
      "step": 14
    },
    {
      "epoch": 0.052128583840139006,
      "grad_norm": 1.7389640808105469,
      "learning_rate": 1.99770217861636e-05,
      "loss": 0.6048,
      "step": 15
    },
    {
      "epoch": 0.05560382276281494,
      "grad_norm": 1.7822411060333252,
      "learning_rate": 1.9968728424878178e-05,
      "loss": 0.6343,
      "step": 16
    },
    {
      "epoch": 0.059079061685490875,
      "grad_norm": 1.7938050031661987,
      "learning_rate": 1.9959162014075553e-05,
      "loss": 0.6308,
      "step": 17
    },
    {
      "epoch": 0.06255430060816682,
      "grad_norm": 1.7255632877349854,
      "learning_rate": 1.994832377542755e-05,
      "loss": 0.6382,
      "step": 18
    },
    {
      "epoch": 0.06602953953084274,
      "grad_norm": 1.6459022760391235,
      "learning_rate": 1.9936215093023884e-05,
      "loss": 0.5864,
      "step": 19
    },
    {
      "epoch": 0.06950477845351868,
      "grad_norm": 1.5717964172363281,
      "learning_rate": 1.9922837513195406e-05,
      "loss": 0.6014,
      "step": 20
    },
    {
      "epoch": 0.07298001737619461,
      "grad_norm": 1.4626662731170654,
      "learning_rate": 1.990819274431662e-05,
      "loss": 0.5919,
      "step": 21
    },
    {
      "epoch": 0.07645525629887055,
      "grad_norm": 1.5188833475112915,
      "learning_rate": 1.989228265658754e-05,
      "loss": 0.6285,
      "step": 22
    },
    {
      "epoch": 0.07993049522154648,
      "grad_norm": 1.4063705205917358,
      "learning_rate": 1.9875109281794828e-05,
      "loss": 0.6498,
      "step": 23
    },
    {
      "epoch": 0.08340573414422242,
      "grad_norm": 1.5133353471755981,
      "learning_rate": 1.9856674813052345e-05,
      "loss": 0.6267,
      "step": 24
    },
    {
      "epoch": 0.08688097306689835,
      "grad_norm": 1.294826865196228,
      "learning_rate": 1.9836981604521077e-05,
      "loss": 0.6798,
      "step": 25
    },
    {
      "epoch": 0.09035621198957429,
      "grad_norm": 1.4135065078735352,
      "learning_rate": 1.98160321711085e-05,
      "loss": 0.6478,
      "step": 26
    },
    {
      "epoch": 0.09383145091225022,
      "grad_norm": 1.3046231269836426,
      "learning_rate": 1.9793829188147406e-05,
      "loss": 0.6136,
      "step": 27
    },
    {
      "epoch": 0.09730668983492616,
      "grad_norm": 1.2951562404632568,
      "learning_rate": 1.9770375491054264e-05,
      "loss": 0.6365,
      "step": 28
    },
    {
      "epoch": 0.10078192875760209,
      "grad_norm": 1.2141485214233398,
      "learning_rate": 1.974567407496712e-05,
      "loss": 0.6168,
      "step": 29
    },
    {
      "epoch": 0.10425716768027801,
      "grad_norm": 1.267811894416809,
      "learning_rate": 1.9719728094363103e-05,
      "loss": 0.6212,
      "step": 30
    },
    {
      "epoch": 0.10773240660295395,
      "grad_norm": 1.3066190481185913,
      "learning_rate": 1.9692540862655587e-05,
      "loss": 0.6246,
      "step": 31
    },
    {
      "epoch": 0.11120764552562988,
      "grad_norm": 1.3221654891967773,
      "learning_rate": 1.966411585177105e-05,
      "loss": 0.585,
      "step": 32
    },
    {
      "epoch": 0.11468288444830582,
      "grad_norm": 1.2844849824905396,
      "learning_rate": 1.9634456691705705e-05,
      "loss": 0.6416,
      "step": 33
    },
    {
      "epoch": 0.11815812337098175,
      "grad_norm": 1.2656630277633667,
      "learning_rate": 1.9603567170061918e-05,
      "loss": 0.6112,
      "step": 34
    },
    {
      "epoch": 0.12163336229365769,
      "grad_norm": 1.2453721761703491,
      "learning_rate": 1.9571451231564523e-05,
      "loss": 0.6193,
      "step": 35
    },
    {
      "epoch": 0.12510860121633363,
      "grad_norm": 1.3584957122802734,
      "learning_rate": 1.9538112977557077e-05,
      "loss": 0.661,
      "step": 36
    },
    {
      "epoch": 0.12858384013900956,
      "grad_norm": 1.2529661655426025,
      "learning_rate": 1.9503556665478066e-05,
      "loss": 0.626,
      "step": 37
    },
    {
      "epoch": 0.13205907906168549,
      "grad_norm": 1.168838381767273,
      "learning_rate": 1.9467786708317257e-05,
      "loss": 0.6553,
      "step": 38
    },
    {
      "epoch": 0.1355343179843614,
      "grad_norm": 1.266099452972412,
      "learning_rate": 1.9430807674052092e-05,
      "loss": 0.6105,
      "step": 39
    },
    {
      "epoch": 0.13900955690703737,
      "grad_norm": 1.2240427732467651,
      "learning_rate": 1.939262428506438e-05,
      "loss": 0.6362,
      "step": 40
    },
    {
      "epoch": 0.1424847958297133,
      "grad_norm": 1.181288480758667,
      "learning_rate": 1.9353241417537216e-05,
      "loss": 0.639,
      "step": 41
    },
    {
      "epoch": 0.14596003475238922,
      "grad_norm": 1.1083927154541016,
      "learning_rate": 1.9312664100832236e-05,
      "loss": 0.6421,
      "step": 42
    },
    {
      "epoch": 0.14943527367506515,
      "grad_norm": 1.2228623628616333,
      "learning_rate": 1.9270897516847406e-05,
      "loss": 0.5897,
      "step": 43
    },
    {
      "epoch": 0.1529105125977411,
      "grad_norm": 1.215945839881897,
      "learning_rate": 1.9227946999355226e-05,
      "loss": 0.6122,
      "step": 44
    },
    {
      "epoch": 0.15638575152041703,
      "grad_norm": 1.1623311042785645,
      "learning_rate": 1.9183818033321612e-05,
      "loss": 0.5546,
      "step": 45
    },
    {
      "epoch": 0.15986099044309296,
      "grad_norm": 1.3293700218200684,
      "learning_rate": 1.9138516254205416e-05,
      "loss": 0.6268,
      "step": 46
    },
    {
      "epoch": 0.1633362293657689,
      "grad_norm": 1.1984280347824097,
      "learning_rate": 1.9092047447238775e-05,
      "loss": 0.6081,
      "step": 47
    },
    {
      "epoch": 0.16681146828844484,
      "grad_norm": 1.1898068189620972,
      "learning_rate": 1.9044417546688295e-05,
      "loss": 0.6007,
      "step": 48
    },
    {
      "epoch": 0.17028670721112077,
      "grad_norm": 1.2403992414474487,
      "learning_rate": 1.899563263509725e-05,
      "loss": 0.6219,
      "step": 49
    },
    {
      "epoch": 0.1737619461337967,
      "grad_norm": 1.3231101036071777,
      "learning_rate": 1.894569894250877e-05,
      "loss": 0.5834,
      "step": 50
    },
    {
      "epoch": 0.17723718505647262,
      "grad_norm": 1.3285619020462036,
      "learning_rate": 1.8894622845670282e-05,
      "loss": 0.6568,
      "step": 51
    },
    {
      "epoch": 0.18071242397914858,
      "grad_norm": 1.1555180549621582,
      "learning_rate": 1.8842410867219137e-05,
      "loss": 0.6269,
      "step": 52
    },
    {
      "epoch": 0.1841876629018245,
      "grad_norm": 1.1796412467956543,
      "learning_rate": 1.878906967484966e-05,
      "loss": 0.5973,
      "step": 53
    },
    {
      "epoch": 0.18766290182450043,
      "grad_norm": 1.144237756729126,
      "learning_rate": 1.8734606080461657e-05,
      "loss": 0.6367,
      "step": 54
    },
    {
      "epoch": 0.19113814074717636,
      "grad_norm": 1.1937980651855469,
      "learning_rate": 1.86790270392905e-05,
      "loss": 0.6006,
      "step": 55
    },
    {
      "epoch": 0.19461337966985232,
      "grad_norm": 1.0895384550094604,
      "learning_rate": 1.8622339649018907e-05,
      "loss": 0.579,
      "step": 56
    },
    {
      "epoch": 0.19808861859252824,
      "grad_norm": 1.1548895835876465,
      "learning_rate": 1.856455114887056e-05,
      "loss": 0.5521,
      "step": 57
    },
    {
      "epoch": 0.20156385751520417,
      "grad_norm": 1.0904297828674316,
      "learning_rate": 1.8505668918685603e-05,
      "loss": 0.6222,
      "step": 58
    },
    {
      "epoch": 0.2050390964378801,
      "grad_norm": 1.2059494256973267,
      "learning_rate": 1.8445700477978207e-05,
      "loss": 0.6631,
      "step": 59
    },
    {
      "epoch": 0.20851433536055602,
      "grad_norm": 1.1830602884292603,
      "learning_rate": 1.8384653484976305e-05,
      "loss": 0.5963,
      "step": 60
    },
    {
      "epoch": 0.21198957428323198,
      "grad_norm": 1.2046791315078735,
      "learning_rate": 1.8322535735643604e-05,
      "loss": 0.5943,
      "step": 61
    },
    {
      "epoch": 0.2154648132059079,
      "grad_norm": 1.1545571088790894,
      "learning_rate": 1.8259355162684e-05,
      "loss": 0.6301,
      "step": 62
    },
    {
      "epoch": 0.21894005212858383,
      "grad_norm": 1.0760880708694458,
      "learning_rate": 1.8195119834528535e-05,
      "loss": 0.6366,
      "step": 63
    },
    {
      "epoch": 0.22241529105125976,
      "grad_norm": 1.2411205768585205,
      "learning_rate": 1.8129837954305033e-05,
      "loss": 0.6156,
      "step": 64
    },
    {
      "epoch": 0.22589052997393572,
      "grad_norm": 1.1241486072540283,
      "learning_rate": 1.8063517858790517e-05,
      "loss": 0.608,
      "step": 65
    },
    {
      "epoch": 0.22936576889661164,
      "grad_norm": 1.0839852094650269,
      "learning_rate": 1.799616801734657e-05,
      "loss": 0.6334,
      "step": 66
    },
    {
      "epoch": 0.23284100781928757,
      "grad_norm": 1.125570297241211,
      "learning_rate": 1.792779703083777e-05,
      "loss": 0.6139,
      "step": 67
    },
    {
      "epoch": 0.2363162467419635,
      "grad_norm": 1.1417044401168823,
      "learning_rate": 1.7858413630533305e-05,
      "loss": 0.5897,
      "step": 68
    },
    {
      "epoch": 0.23979148566463945,
      "grad_norm": 1.0312554836273193,
      "learning_rate": 1.778802667699196e-05,
      "loss": 0.606,
      "step": 69
    },
    {
      "epoch": 0.24326672458731538,
      "grad_norm": 1.0954256057739258,
      "learning_rate": 1.77166451589306e-05,
      "loss": 0.5888,
      "step": 70
    },
    {
      "epoch": 0.2467419635099913,
      "grad_norm": 1.031202793121338,
      "learning_rate": 1.764427819207624e-05,
      "loss": 0.6004,
      "step": 71
    },
    {
      "epoch": 0.25021720243266726,
      "grad_norm": 1.1144808530807495,
      "learning_rate": 1.757093501800196e-05,
      "loss": 0.5699,
      "step": 72
    },
    {
      "epoch": 0.2536924413553432,
      "grad_norm": 1.0327179431915283,
      "learning_rate": 1.7496625002946702e-05,
      "loss": 0.6051,
      "step": 73
    },
    {
      "epoch": 0.2571676802780191,
      "grad_norm": 1.080093264579773,
      "learning_rate": 1.7421357636619153e-05,
      "loss": 0.6494,
      "step": 74
    },
    {
      "epoch": 0.26064291920069504,
      "grad_norm": 1.2163845300674438,
      "learning_rate": 1.734514253098589e-05,
      "loss": 0.6033,
      "step": 75
    },
    {
      "epoch": 0.26411815812337097,
      "grad_norm": 1.1358786821365356,
      "learning_rate": 1.726798941904386e-05,
      "loss": 0.596,
      "step": 76
    },
    {
      "epoch": 0.2675933970460469,
      "grad_norm": 1.0965262651443481,
      "learning_rate": 1.7189908153577473e-05,
      "loss": 0.5669,
      "step": 77
    },
    {
      "epoch": 0.2710686359687228,
      "grad_norm": 1.2162927389144897,
      "learning_rate": 1.7110908705900322e-05,
      "loss": 0.6133,
      "step": 78
    },
    {
      "epoch": 0.2745438748913988,
      "grad_norm": 1.186356544494629,
      "learning_rate": 1.7031001164581828e-05,
      "loss": 0.6405,
      "step": 79
    },
    {
      "epoch": 0.27801911381407474,
      "grad_norm": 1.0369867086410522,
      "learning_rate": 1.6950195734158874e-05,
      "loss": 0.579,
      "step": 80
    },
    {
      "epoch": 0.28149435273675066,
      "grad_norm": 1.0901113748550415,
      "learning_rate": 1.6868502733832647e-05,
      "loss": 0.5825,
      "step": 81
    },
    {
      "epoch": 0.2849695916594266,
      "grad_norm": 1.0874531269073486,
      "learning_rate": 1.6785932596150827e-05,
      "loss": 0.6038,
      "step": 82
    },
    {
      "epoch": 0.2884448305821025,
      "grad_norm": 1.1928529739379883,
      "learning_rate": 1.670249586567531e-05,
      "loss": 0.5979,
      "step": 83
    },
    {
      "epoch": 0.29192006950477845,
      "grad_norm": 1.0388442277908325,
      "learning_rate": 1.6618203197635624e-05,
      "loss": 0.5832,
      "step": 84
    },
    {
      "epoch": 0.2953953084274544,
      "grad_norm": 1.2084934711456299,
      "learning_rate": 1.6533065356568206e-05,
      "loss": 0.5844,
      "step": 85
    },
    {
      "epoch": 0.2988705473501303,
      "grad_norm": 1.223413348197937,
      "learning_rate": 1.6447093214941727e-05,
      "loss": 0.619,
      "step": 86
    },
    {
      "epoch": 0.3023457862728063,
      "grad_norm": 1.0778778791427612,
      "learning_rate": 1.636029775176862e-05,
      "loss": 0.6573,
      "step": 87
    },
    {
      "epoch": 0.3058210251954822,
      "grad_norm": 1.0900267362594604,
      "learning_rate": 1.627269005120304e-05,
      "loss": 0.5779,
      "step": 88
    },
    {
      "epoch": 0.30929626411815814,
      "grad_norm": 1.0737788677215576,
      "learning_rate": 1.618428130112533e-05,
      "loss": 0.5994,
      "step": 89
    },
    {
      "epoch": 0.31277150304083406,
      "grad_norm": 1.0811336040496826,
      "learning_rate": 1.6095082791713322e-05,
      "loss": 0.626,
      "step": 90
    },
    {
      "epoch": 0.31624674196351,
      "grad_norm": 1.0799121856689453,
      "learning_rate": 1.6005105914000508e-05,
      "loss": 0.6313,
      "step": 91
    },
    {
      "epoch": 0.3197219808861859,
      "grad_norm": 1.0824054479599,
      "learning_rate": 1.5914362158421352e-05,
      "loss": 0.636,
      "step": 92
    },
    {
      "epoch": 0.32319721980886185,
      "grad_norm": 1.0406136512756348,
      "learning_rate": 1.5822863113343934e-05,
      "loss": 0.5972,
      "step": 93
    },
    {
      "epoch": 0.3266724587315378,
      "grad_norm": 1.16518235206604,
      "learning_rate": 1.5730620463590052e-05,
      "loss": 0.5728,
      "step": 94
    },
    {
      "epoch": 0.3301476976542137,
      "grad_norm": 1.0442129373550415,
      "learning_rate": 1.5637645988943008e-05,
      "loss": 0.6138,
      "step": 95
    },
    {
      "epoch": 0.3336229365768897,
      "grad_norm": 1.1471384763717651,
      "learning_rate": 1.554395156264331e-05,
      "loss": 0.6123,
      "step": 96
    },
    {
      "epoch": 0.3370981754995656,
      "grad_norm": 1.0176016092300415,
      "learning_rate": 1.544954914987238e-05,
      "loss": 0.6063,
      "step": 97
    },
    {
      "epoch": 0.34057341442224154,
      "grad_norm": 1.062016487121582,
      "learning_rate": 1.5354450806224553e-05,
      "loss": 0.5842,
      "step": 98
    },
    {
      "epoch": 0.34404865334491747,
      "grad_norm": 1.1640706062316895,
      "learning_rate": 1.5258668676167548e-05,
      "loss": 0.5938,
      "step": 99
    },
    {
      "epoch": 0.3475238922675934,
      "grad_norm": 1.1660997867584229,
      "learning_rate": 1.516221499149154e-05,
      "loss": 0.6202,
      "step": 100
    },
    {
      "epoch": 0.3509991311902693,
      "grad_norm": 1.156037449836731,
      "learning_rate": 1.5065102069747117e-05,
      "loss": 0.6216,
      "step": 101
    },
    {
      "epoch": 0.35447437011294525,
      "grad_norm": 1.2295578718185425,
      "learning_rate": 1.4967342312672283e-05,
      "loss": 0.5649,
      "step": 102
    },
    {
      "epoch": 0.3579496090356212,
      "grad_norm": 1.0619351863861084,
      "learning_rate": 1.48689482046087e-05,
      "loss": 0.5954,
      "step": 103
    },
    {
      "epoch": 0.36142484795829716,
      "grad_norm": 1.266830325126648,
      "learning_rate": 1.4769932310907372e-05,
      "loss": 0.6362,
      "step": 104
    },
    {
      "epoch": 0.3649000868809731,
      "grad_norm": 1.0485084056854248,
      "learning_rate": 1.467030727632401e-05,
      "loss": 0.5664,
      "step": 105
    },
    {
      "epoch": 0.368375325803649,
      "grad_norm": 1.12933349609375,
      "learning_rate": 1.4570085823404232e-05,
      "loss": 0.586,
      "step": 106
    },
    {
      "epoch": 0.37185056472632494,
      "grad_norm": 1.0767207145690918,
      "learning_rate": 1.4469280750858854e-05,
      "loss": 0.5773,
      "step": 107
    },
    {
      "epoch": 0.37532580364900087,
      "grad_norm": 1.121981143951416,
      "learning_rate": 1.4367904931929422e-05,
      "loss": 0.585,
      "step": 108
    },
    {
      "epoch": 0.3788010425716768,
      "grad_norm": 1.1495089530944824,
      "learning_rate": 1.4265971312744252e-05,
      "loss": 0.5698,
      "step": 109
    },
    {
      "epoch": 0.3822762814943527,
      "grad_norm": 1.0745882987976074,
      "learning_rate": 1.4163492910665153e-05,
      "loss": 0.611,
      "step": 110
    },
    {
      "epoch": 0.38575152041702865,
      "grad_norm": 1.0079902410507202,
      "learning_rate": 1.4060482812625055e-05,
      "loss": 0.6226,
      "step": 111
    },
    {
      "epoch": 0.38922675933970463,
      "grad_norm": 1.0810552835464478,
      "learning_rate": 1.395695417345675e-05,
      "loss": 0.6257,
      "step": 112
    },
    {
      "epoch": 0.39270199826238056,
      "grad_norm": 1.040677547454834,
      "learning_rate": 1.3852920214212966e-05,
      "loss": 0.5883,
      "step": 113
    },
    {
      "epoch": 0.3961772371850565,
      "grad_norm": 1.0766505002975464,
      "learning_rate": 1.3748394220477972e-05,
      "loss": 0.5804,
      "step": 114
    },
    {
      "epoch": 0.3996524761077324,
      "grad_norm": 1.0619298219680786,
      "learning_rate": 1.3643389540670963e-05,
      "loss": 0.6178,
      "step": 115
    },
    {
      "epoch": 0.40312771503040834,
      "grad_norm": 1.019912838935852,
      "learning_rate": 1.3537919584341413e-05,
      "loss": 0.5959,
      "step": 116
    },
    {
      "epoch": 0.40660295395308427,
      "grad_norm": 1.113372564315796,
      "learning_rate": 1.3431997820456592e-05,
      "loss": 0.6051,
      "step": 117
    },
    {
      "epoch": 0.4100781928757602,
      "grad_norm": 1.0494154691696167,
      "learning_rate": 1.3325637775681561e-05,
      "loss": 0.599,
      "step": 118
    },
    {
      "epoch": 0.4135534317984361,
      "grad_norm": 1.1271395683288574,
      "learning_rate": 1.3218853032651719e-05,
      "loss": 0.6322,
      "step": 119
    },
    {
      "epoch": 0.41702867072111205,
      "grad_norm": 1.1663187742233276,
      "learning_rate": 1.3111657228238263e-05,
      "loss": 0.6209,
      "step": 120
    },
    {
      "epoch": 0.42050390964378803,
      "grad_norm": 1.0917779207229614,
      "learning_rate": 1.3004064051806712e-05,
      "loss": 0.5832,
      "step": 121
    },
    {
      "epoch": 0.42397914856646396,
      "grad_norm": 1.1407371759414673,
      "learning_rate": 1.2896087243468673e-05,
      "loss": 0.5645,
      "step": 122
    },
    {
      "epoch": 0.4274543874891399,
      "grad_norm": 1.1023014783859253,
      "learning_rate": 1.2787740592327232e-05,
      "loss": 0.5921,
      "step": 123
    },
    {
      "epoch": 0.4309296264118158,
      "grad_norm": 1.1019343137741089,
      "learning_rate": 1.267903793471597e-05,
      "loss": 0.5687,
      "step": 124
    },
    {
      "epoch": 0.43440486533449174,
      "grad_norm": 1.0315438508987427,
      "learning_rate": 1.2569993152432028e-05,
      "loss": 0.5973,
      "step": 125
    },
    {
      "epoch": 0.43788010425716767,
      "grad_norm": 1.1791157722473145,
      "learning_rate": 1.2460620170963353e-05,
      "loss": 0.6029,
      "step": 126
    },
    {
      "epoch": 0.4413553431798436,
      "grad_norm": 1.1693004369735718,
      "learning_rate": 1.2350932957710322e-05,
      "loss": 0.5916,
      "step": 127
    },
    {
      "epoch": 0.4448305821025195,
      "grad_norm": 1.023739218711853,
      "learning_rate": 1.2240945520202079e-05,
      "loss": 0.5557,
      "step": 128
    },
    {
      "epoch": 0.4483058210251955,
      "grad_norm": 1.2074741125106812,
      "learning_rate": 1.2130671904307692e-05,
      "loss": 0.605,
      "step": 129
    },
    {
      "epoch": 0.45178105994787143,
      "grad_norm": 1.0737248659133911,
      "learning_rate": 1.202012619244243e-05,
      "loss": 0.6198,
      "step": 130
    },
    {
      "epoch": 0.45525629887054736,
      "grad_norm": 1.0830988883972168,
      "learning_rate": 1.1909322501769407e-05,
      "loss": 0.5757,
      "step": 131
    },
    {
      "epoch": 0.4587315377932233,
      "grad_norm": 1.050476312637329,
      "learning_rate": 1.1798274982396728e-05,
      "loss": 0.5597,
      "step": 132
    },
    {
      "epoch": 0.4622067767158992,
      "grad_norm": 0.9489808678627014,
      "learning_rate": 1.1686997815570473e-05,
      "loss": 0.5686,
      "step": 133
    },
    {
      "epoch": 0.46568201563857514,
      "grad_norm": 1.009084701538086,
      "learning_rate": 1.15755052118637e-05,
      "loss": 0.6,
      "step": 134
    },
    {
      "epoch": 0.46915725456125107,
      "grad_norm": 1.058876872062683,
      "learning_rate": 1.1463811409361667e-05,
      "loss": 0.5479,
      "step": 135
    },
    {
      "epoch": 0.472632493483927,
      "grad_norm": 1.0262316465377808,
      "learning_rate": 1.13519306718436e-05,
      "loss": 0.6045,
      "step": 136
    },
    {
      "epoch": 0.476107732406603,
      "grad_norm": 1.0066461563110352,
      "learning_rate": 1.1239877286961123e-05,
      "loss": 0.561,
      "step": 137
    },
    {
      "epoch": 0.4795829713292789,
      "grad_norm": 0.9656773209571838,
      "learning_rate": 1.112766556441367e-05,
      "loss": 0.6139,
      "step": 138
    },
    {
      "epoch": 0.48305821025195483,
      "grad_norm": 1.0848320722579956,
      "learning_rate": 1.1015309834121083e-05,
      "loss": 0.5811,
      "step": 139
    },
    {
      "epoch": 0.48653344917463076,
      "grad_norm": 1.0625325441360474,
      "learning_rate": 1.0902824444393602e-05,
      "loss": 0.6299,
      "step": 140
    },
    {
      "epoch": 0.4900086880973067,
      "grad_norm": 1.0536874532699585,
      "learning_rate": 1.079022376009955e-05,
      "loss": 0.5937,
      "step": 141
    },
    {
      "epoch": 0.4934839270199826,
      "grad_norm": 1.0022127628326416,
      "learning_rate": 1.067752216083085e-05,
      "loss": 0.5718,
      "step": 142
    },
    {
      "epoch": 0.49695916594265854,
      "grad_norm": 1.0636472702026367,
      "learning_rate": 1.05647340390667e-05,
      "loss": 0.6251,
      "step": 143
    },
    {
      "epoch": 0.5004344048653345,
      "grad_norm": 1.0569506883621216,
      "learning_rate": 1.0451873798335605e-05,
      "loss": 0.5583,
      "step": 144
    },
    {
      "epoch": 0.5039096437880104,
      "grad_norm": 1.04555344581604,
      "learning_rate": 1.0338955851375962e-05,
      "loss": 0.5893,
      "step": 145
    },
    {
      "epoch": 0.5073848827106864,
      "grad_norm": 0.9304705858230591,
      "learning_rate": 1.0225994618295507e-05,
      "loss": 0.5414,
      "step": 146
    },
    {
      "epoch": 0.5108601216333623,
      "grad_norm": 0.9779573082923889,
      "learning_rate": 1.01130045247298e-05,
      "loss": 0.5601,
      "step": 147
    },
    {
      "epoch": 0.5143353605560382,
      "grad_norm": 1.0225908756256104,
      "learning_rate": 1e-05,
      "loss": 0.65,
      "step": 148
    },
    {
      "epoch": 0.5178105994787141,
      "grad_norm": 0.9953768253326416,
      "learning_rate": 9.886995475270205e-06,
      "loss": 0.5905,
      "step": 149
    },
    {
      "epoch": 0.5212858384013901,
      "grad_norm": 1.0968619585037231,
      "learning_rate": 9.774005381704498e-06,
      "loss": 0.5995,
      "step": 150
    },
    {
      "epoch": 0.5247610773240661,
      "grad_norm": 0.9768278002738953,
      "learning_rate": 9.661044148624038e-06,
      "loss": 0.642,
      "step": 151
    },
    {
      "epoch": 0.5282363162467419,
      "grad_norm": 1.0109375715255737,
      "learning_rate": 9.548126201664398e-06,
      "loss": 0.534,
      "step": 152
    },
    {
      "epoch": 0.5317115551694179,
      "grad_norm": 1.0256866216659546,
      "learning_rate": 9.435265960933304e-06,
      "loss": 0.6327,
      "step": 153
    },
    {
      "epoch": 0.5351867940920938,
      "grad_norm": 0.9121344089508057,
      "learning_rate": 9.322477839169156e-06,
      "loss": 0.5787,
      "step": 154
    },
    {
      "epoch": 0.5386620330147698,
      "grad_norm": 0.9741325974464417,
      "learning_rate": 9.209776239900453e-06,
      "loss": 0.5954,
      "step": 155
    },
    {
      "epoch": 0.5421372719374457,
      "grad_norm": 1.0003210306167603,
      "learning_rate": 9.097175555606396e-06,
      "loss": 0.5514,
      "step": 156
    },
    {
      "epoch": 0.5456125108601216,
      "grad_norm": 1.0428141355514526,
      "learning_rate": 8.98469016587892e-06,
      "loss": 0.5798,
      "step": 157
    },
    {
      "epoch": 0.5490877497827976,
      "grad_norm": 1.0282821655273438,
      "learning_rate": 8.872334435586333e-06,
      "loss": 0.5911,
      "step": 158
    },
    {
      "epoch": 0.5525629887054735,
      "grad_norm": 0.9479502439498901,
      "learning_rate": 8.76012271303888e-06,
      "loss": 0.6158,
      "step": 159
    },
    {
      "epoch": 0.5560382276281495,
      "grad_norm": 1.0172061920166016,
      "learning_rate": 8.648069328156403e-06,
      "loss": 0.5601,
      "step": 160
    },
    {
      "epoch": 0.5595134665508253,
      "grad_norm": 1.0053855180740356,
      "learning_rate": 8.536188590638334e-06,
      "loss": 0.6152,
      "step": 161
    },
    {
      "epoch": 0.5629887054735013,
      "grad_norm": 0.974694550037384,
      "learning_rate": 8.424494788136303e-06,
      "loss": 0.5666,
      "step": 162
    },
    {
      "epoch": 0.5664639443961772,
      "grad_norm": 0.9912855625152588,
      "learning_rate": 8.313002184429529e-06,
      "loss": 0.5695,
      "step": 163
    },
    {
      "epoch": 0.5699391833188532,
      "grad_norm": 0.962709367275238,
      "learning_rate": 8.201725017603277e-06,
      "loss": 0.5357,
      "step": 164
    },
    {
      "epoch": 0.573414422241529,
      "grad_norm": 0.9394634962081909,
      "learning_rate": 8.090677498230598e-06,
      "loss": 0.564,
      "step": 165
    },
    {
      "epoch": 0.576889661164205,
      "grad_norm": 0.9721453785896301,
      "learning_rate": 7.97987380755757e-06,
      "loss": 0.5933,
      "step": 166
    },
    {
      "epoch": 0.580364900086881,
      "grad_norm": 1.0866742134094238,
      "learning_rate": 7.869328095692313e-06,
      "loss": 0.5811,
      "step": 167
    },
    {
      "epoch": 0.5838401390095569,
      "grad_norm": 0.9584195613861084,
      "learning_rate": 7.759054479797924e-06,
      "loss": 0.5816,
      "step": 168
    },
    {
      "epoch": 0.5873153779322329,
      "grad_norm": 1.0240747928619385,
      "learning_rate": 7.649067042289681e-06,
      "loss": 0.5688,
      "step": 169
    },
    {
      "epoch": 0.5907906168549087,
      "grad_norm": 1.0707347393035889,
      "learning_rate": 7.539379829036652e-06,
      "loss": 0.5823,
      "step": 170
    },
    {
      "epoch": 0.5942658557775847,
      "grad_norm": 0.9794528484344482,
      "learning_rate": 7.430006847567972e-06,
      "loss": 0.5933,
      "step": 171
    },
    {
      "epoch": 0.5977410947002606,
      "grad_norm": 1.1342881917953491,
      "learning_rate": 7.320962065284032e-06,
      "loss": 0.5611,
      "step": 172
    },
    {
      "epoch": 0.6012163336229366,
      "grad_norm": 1.0687241554260254,
      "learning_rate": 7.2122594076727705e-06,
      "loss": 0.6206,
      "step": 173
    },
    {
      "epoch": 0.6046915725456126,
      "grad_norm": 1.1404447555541992,
      "learning_rate": 7.1039127565313285e-06,
      "loss": 0.619,
      "step": 174
    },
    {
      "epoch": 0.6081668114682884,
      "grad_norm": 1.0000810623168945,
      "learning_rate": 6.995935948193294e-06,
      "loss": 0.5557,
      "step": 175
    },
    {
      "epoch": 0.6116420503909644,
      "grad_norm": 0.9445539712905884,
      "learning_rate": 6.888342771761737e-06,
      "loss": 0.5392,
      "step": 176
    },
    {
      "epoch": 0.6151172893136403,
      "grad_norm": 0.9872927069664001,
      "learning_rate": 6.781146967348283e-06,
      "loss": 0.557,
      "step": 177
    },
    {
      "epoch": 0.6185925282363163,
      "grad_norm": 1.1482657194137573,
      "learning_rate": 6.6743622243184405e-06,
      "loss": 0.6229,
      "step": 178
    },
    {
      "epoch": 0.6220677671589921,
      "grad_norm": 0.9911180138587952,
      "learning_rate": 6.568002179543409e-06,
      "loss": 0.5777,
      "step": 179
    },
    {
      "epoch": 0.6255430060816681,
      "grad_norm": 1.0723944902420044,
      "learning_rate": 6.462080415658591e-06,
      "loss": 0.5546,
      "step": 180
    },
    {
      "epoch": 0.629018245004344,
      "grad_norm": 1.0823560953140259,
      "learning_rate": 6.356610459329038e-06,
      "loss": 0.5811,
      "step": 181
    },
    {
      "epoch": 0.63249348392702,
      "grad_norm": 1.0165990591049194,
      "learning_rate": 6.251605779522032e-06,
      "loss": 0.5626,
      "step": 182
    },
    {
      "epoch": 0.635968722849696,
      "grad_norm": 0.9735214114189148,
      "learning_rate": 6.147079785787038e-06,
      "loss": 0.5326,
      "step": 183
    },
    {
      "epoch": 0.6394439617723718,
      "grad_norm": 0.993291974067688,
      "learning_rate": 6.043045826543254e-06,
      "loss": 0.5609,
      "step": 184
    },
    {
      "epoch": 0.6429192006950478,
      "grad_norm": 0.9579317569732666,
      "learning_rate": 5.93951718737495e-06,
      "loss": 0.5593,
      "step": 185
    },
    {
      "epoch": 0.6463944396177237,
      "grad_norm": 0.8985302448272705,
      "learning_rate": 5.836507089334849e-06,
      "loss": 0.5391,
      "step": 186
    },
    {
      "epoch": 0.6498696785403997,
      "grad_norm": 1.0520668029785156,
      "learning_rate": 5.7340286872557515e-06,
      "loss": 0.6108,
      "step": 187
    },
    {
      "epoch": 0.6533449174630755,
      "grad_norm": 0.9960751533508301,
      "learning_rate": 5.6320950680705826e-06,
      "loss": 0.5867,
      "step": 188
    },
    {
      "epoch": 0.6568201563857515,
      "grad_norm": 1.0133719444274902,
      "learning_rate": 5.530719249141148e-06,
      "loss": 0.5313,
      "step": 189
    },
    {
      "epoch": 0.6602953953084274,
      "grad_norm": 0.9764801859855652,
      "learning_rate": 5.429914176595772e-06,
      "loss": 0.559,
      "step": 190
    },
    {
      "epoch": 0.6637706342311034,
      "grad_norm": 1.0536527633666992,
      "learning_rate": 5.329692723675994e-06,
      "loss": 0.5625,
      "step": 191
    },
    {
      "epoch": 0.6672458731537794,
      "grad_norm": 1.139304280281067,
      "learning_rate": 5.230067689092629e-06,
      "loss": 0.5941,
      "step": 192
    },
    {
      "epoch": 0.6707211120764552,
      "grad_norm": 1.0412135124206543,
      "learning_rate": 5.131051795391302e-06,
      "loss": 0.6064,
      "step": 193
    },
    {
      "epoch": 0.6741963509991312,
      "grad_norm": 1.0304534435272217,
      "learning_rate": 5.03265768732772e-06,
      "loss": 0.5576,
      "step": 194
    },
    {
      "epoch": 0.6776715899218071,
      "grad_norm": 0.9016774892807007,
      "learning_rate": 4.934897930252887e-06,
      "loss": 0.59,
      "step": 195
    },
    {
      "epoch": 0.6811468288444831,
      "grad_norm": 0.9751427173614502,
      "learning_rate": 4.837785008508462e-06,
      "loss": 0.587,
      "step": 196
    },
    {
      "epoch": 0.684622067767159,
      "grad_norm": 0.9640844464302063,
      "learning_rate": 4.7413313238324556e-06,
      "loss": 0.5332,
      "step": 197
    },
    {
      "epoch": 0.6880973066898349,
      "grad_norm": 1.0356625318527222,
      "learning_rate": 4.645549193775452e-06,
      "loss": 0.5798,
      "step": 198
    },
    {
      "epoch": 0.6915725456125109,
      "grad_norm": 0.9761075377464294,
      "learning_rate": 4.550450850127626e-06,
      "loss": 0.5669,
      "step": 199
    },
    {
      "epoch": 0.6950477845351868,
      "grad_norm": 0.9727676510810852,
      "learning_rate": 4.4560484373566945e-06,
      "loss": 0.5526,
      "step": 200
    },
    {
      "epoch": 0.6985230234578628,
      "grad_norm": 0.9600728750228882,
      "learning_rate": 4.3623540110569935e-06,
      "loss": 0.5225,
      "step": 201
    },
    {
      "epoch": 0.7019982623805386,
      "grad_norm": 1.075377345085144,
      "learning_rate": 4.26937953640995e-06,
      "loss": 0.6123,
      "step": 202
    },
    {
      "epoch": 0.7054735013032146,
      "grad_norm": 0.9740473031997681,
      "learning_rate": 4.177136886656067e-06,
      "loss": 0.5718,
      "step": 203
    },
    {
      "epoch": 0.7089487402258905,
      "grad_norm": 0.894684374332428,
      "learning_rate": 4.085637841578652e-06,
      "loss": 0.6006,
      "step": 204
    },
    {
      "epoch": 0.7124239791485665,
      "grad_norm": 1.0192121267318726,
      "learning_rate": 3.9948940859994964e-06,
      "loss": 0.5745,
      "step": 205
    },
    {
      "epoch": 0.7158992180712423,
      "grad_norm": 1.0569602251052856,
      "learning_rate": 3.9049172082866786e-06,
      "loss": 0.6085,
      "step": 206
    },
    {
      "epoch": 0.7193744569939183,
      "grad_norm": 1.0002917051315308,
      "learning_rate": 3.815718698874672e-06,
      "loss": 0.5586,
      "step": 207
    },
    {
      "epoch": 0.7228496959165943,
      "grad_norm": 0.9531834125518799,
      "learning_rate": 3.727309948796963e-06,
      "loss": 0.5927,
      "step": 208
    },
    {
      "epoch": 0.7263249348392702,
      "grad_norm": 1.045615553855896,
      "learning_rate": 3.6397022482313804e-06,
      "loss": 0.5594,
      "step": 209
    },
    {
      "epoch": 0.7298001737619462,
      "grad_norm": 0.9334245324134827,
      "learning_rate": 3.552906785058278e-06,
      "loss": 0.5364,
      "step": 210
    },
    {
      "epoch": 0.733275412684622,
      "grad_norm": 0.9601573348045349,
      "learning_rate": 3.466934643431795e-06,
      "loss": 0.5571,
      "step": 211
    },
    {
      "epoch": 0.736750651607298,
      "grad_norm": 0.95643150806427,
      "learning_rate": 3.3817968023643766e-06,
      "loss": 0.5717,
      "step": 212
    },
    {
      "epoch": 0.7402258905299739,
      "grad_norm": 0.9626767039299011,
      "learning_rate": 3.2975041343246937e-06,
      "loss": 0.54,
      "step": 213
    },
    {
      "epoch": 0.7437011294526499,
      "grad_norm": 0.9196009635925293,
      "learning_rate": 3.214067403849179e-06,
      "loss": 0.5807,
      "step": 214
    },
    {
      "epoch": 0.7471763683753258,
      "grad_norm": 1.049148440361023,
      "learning_rate": 3.1314972661673572e-06,
      "loss": 0.6214,
      "step": 215
    },
    {
      "epoch": 0.7506516072980017,
      "grad_norm": 0.9988502264022827,
      "learning_rate": 3.0498042658411276e-06,
      "loss": 0.5561,
      "step": 216
    },
    {
      "epoch": 0.7541268462206777,
      "grad_norm": 1.0174543857574463,
      "learning_rate": 2.9689988354181742e-06,
      "loss": 0.5698,
      "step": 217
    },
    {
      "epoch": 0.7576020851433536,
      "grad_norm": 1.0161805152893066,
      "learning_rate": 2.8890912940996784e-06,
      "loss": 0.5478,
      "step": 218
    },
    {
      "epoch": 0.7610773240660296,
      "grad_norm": 0.9427905082702637,
      "learning_rate": 2.8100918464225304e-06,
      "loss": 0.5637,
      "step": 219
    },
    {
      "epoch": 0.7645525629887054,
      "grad_norm": 1.0057843923568726,
      "learning_rate": 2.7320105809561415e-06,
      "loss": 0.5674,
      "step": 220
    },
    {
      "epoch": 0.7680278019113814,
      "grad_norm": 1.013282299041748,
      "learning_rate": 2.654857469014113e-06,
      "loss": 0.5863,
      "step": 221
    },
    {
      "epoch": 0.7715030408340573,
      "grad_norm": 0.980964720249176,
      "learning_rate": 2.5786423633808487e-06,
      "loss": 0.573,
      "step": 222
    },
    {
      "epoch": 0.7749782797567333,
      "grad_norm": 0.9535639882087708,
      "learning_rate": 2.5033749970533015e-06,
      "loss": 0.5594,
      "step": 223
    },
    {
      "epoch": 0.7784535186794093,
      "grad_norm": 0.9691473841667175,
      "learning_rate": 2.4290649819980404e-06,
      "loss": 0.5937,
      "step": 224
    },
    {
      "epoch": 0.7819287576020851,
      "grad_norm": 0.9758164286613464,
      "learning_rate": 2.3557218079237608e-06,
      "loss": 0.5412,
      "step": 225
    },
    {
      "epoch": 0.7854039965247611,
      "grad_norm": 0.9405572414398193,
      "learning_rate": 2.283354841069403e-06,
      "loss": 0.5655,
      "step": 226
    },
    {
      "epoch": 0.788879235447437,
      "grad_norm": 0.8931716680526733,
      "learning_rate": 2.211973323008041e-06,
      "loss": 0.5258,
      "step": 227
    },
    {
      "epoch": 0.792354474370113,
      "grad_norm": 0.9230597019195557,
      "learning_rate": 2.1415863694666973e-06,
      "loss": 0.5538,
      "step": 228
    },
    {
      "epoch": 0.7958297132927888,
      "grad_norm": 0.9798585176467896,
      "learning_rate": 2.072202969162234e-06,
      "loss": 0.5556,
      "step": 229
    },
    {
      "epoch": 0.7993049522154648,
      "grad_norm": 0.9468632340431213,
      "learning_rate": 2.0038319826534312e-06,
      "loss": 0.597,
      "step": 230
    },
    {
      "epoch": 0.8027801911381407,
      "grad_norm": 0.9490894079208374,
      "learning_rate": 1.936482141209486e-06,
      "loss": 0.5645,
      "step": 231
    },
    {
      "epoch": 0.8062554300608167,
      "grad_norm": 0.9810106754302979,
      "learning_rate": 1.870162045694971e-06,
      "loss": 0.5281,
      "step": 232
    },
    {
      "epoch": 0.8097306689834927,
      "grad_norm": 0.9110345840454102,
      "learning_rate": 1.8048801654714687e-06,
      "loss": 0.5774,
      "step": 233
    },
    {
      "epoch": 0.8132059079061685,
      "grad_norm": 0.9290068745613098,
      "learning_rate": 1.7406448373160024e-06,
      "loss": 0.6053,
      "step": 234
    },
    {
      "epoch": 0.8166811468288445,
      "grad_norm": 0.9238559603691101,
      "learning_rate": 1.6774642643563955e-06,
      "loss": 0.5125,
      "step": 235
    },
    {
      "epoch": 0.8201563857515204,
      "grad_norm": 1.0130783319473267,
      "learning_rate": 1.615346515023698e-06,
      "loss": 0.5873,
      "step": 236
    },
    {
      "epoch": 0.8236316246741964,
      "grad_norm": 0.9619508981704712,
      "learning_rate": 1.5542995220217961e-06,
      "loss": 0.5979,
      "step": 237
    },
    {
      "epoch": 0.8271068635968722,
      "grad_norm": 1.0197911262512207,
      "learning_rate": 1.4943310813144006e-06,
      "loss": 0.6156,
      "step": 238
    },
    {
      "epoch": 0.8305821025195482,
      "grad_norm": 0.9349676370620728,
      "learning_rate": 1.4354488511294418e-06,
      "loss": 0.5689,
      "step": 239
    },
    {
      "epoch": 0.8340573414422241,
      "grad_norm": 1.0270828008651733,
      "learning_rate": 1.3776603509810938e-06,
      "loss": 0.5397,
      "step": 240
    },
    {
      "epoch": 0.8375325803649001,
      "grad_norm": 0.9381040334701538,
      "learning_rate": 1.3209729607095022e-06,
      "loss": 0.542,
      "step": 241
    },
    {
      "epoch": 0.8410078192875761,
      "grad_norm": 1.0486862659454346,
      "learning_rate": 1.2653939195383448e-06,
      "loss": 0.5886,
      "step": 242
    },
    {
      "epoch": 0.8444830582102519,
      "grad_norm": 0.9653035402297974,
      "learning_rate": 1.2109303251503434e-06,
      "loss": 0.5893,
      "step": 243
    },
    {
      "epoch": 0.8479582971329279,
      "grad_norm": 0.9536635279655457,
      "learning_rate": 1.1575891327808664e-06,
      "loss": 0.5728,
      "step": 244
    },
    {
      "epoch": 0.8514335360556038,
      "grad_norm": 0.9552822113037109,
      "learning_rate": 1.1053771543297198e-06,
      "loss": 0.5335,
      "step": 245
    },
    {
      "epoch": 0.8549087749782798,
      "grad_norm": 0.9501549005508423,
      "learning_rate": 1.0543010574912305e-06,
      "loss": 0.5696,
      "step": 246
    },
    {
      "epoch": 0.8583840139009556,
      "grad_norm": 0.9457242488861084,
      "learning_rate": 1.0043673649027519e-06,
      "loss": 0.5974,
      "step": 247
    },
    {
      "epoch": 0.8618592528236316,
      "grad_norm": 0.9081954956054688,
      "learning_rate": 9.555824533117064e-07,
      "loss": 0.5984,
      "step": 248
    },
    {
      "epoch": 0.8653344917463076,
      "grad_norm": 0.9678678512573242,
      "learning_rate": 9.079525527612321e-07,
      "loss": 0.5463,
      "step": 249
    },
    {
      "epoch": 0.8688097306689835,
      "grad_norm": 0.9402781128883362,
      "learning_rate": 8.614837457945868e-07,
      "loss": 0.5831,
      "step": 250
    },
    {
      "epoch": 0.8722849695916595,
      "grad_norm": 0.8864692449569702,
      "learning_rate": 8.161819666783888e-07,
      "loss": 0.6336,
      "step": 251
    },
    {
      "epoch": 0.8757602085143353,
      "grad_norm": 0.9280177354812622,
      "learning_rate": 7.720530006447735e-07,
      "loss": 0.5792,
      "step": 252
    },
    {
      "epoch": 0.8792354474370113,
      "grad_norm": 0.9325032830238342,
      "learning_rate": 7.291024831525961e-07,
      "loss": 0.5374,
      "step": 253
    },
    {
      "epoch": 0.8827106863596872,
      "grad_norm": 0.8580663800239563,
      "learning_rate": 6.87335899167767e-07,
      "loss": 0.5319,
      "step": 254
    },
    {
      "epoch": 0.8861859252823632,
      "grad_norm": 0.98850017786026,
      "learning_rate": 6.467585824627886e-07,
      "loss": 0.6334,
      "step": 255
    },
    {
      "epoch": 0.889661164205039,
      "grad_norm": 0.9273893237113953,
      "learning_rate": 6.073757149356185e-07,
      "loss": 0.5404,
      "step": 256
    },
    {
      "epoch": 0.893136403127715,
      "grad_norm": 0.9261394739151001,
      "learning_rate": 5.691923259479093e-07,
      "loss": 0.553,
      "step": 257
    },
    {
      "epoch": 0.896611642050391,
      "grad_norm": 0.9248658418655396,
      "learning_rate": 5.322132916827483e-07,
      "loss": 0.5835,
      "step": 258
    },
    {
      "epoch": 0.9000868809730669,
      "grad_norm": 0.9358659982681274,
      "learning_rate": 4.964433345219354e-07,
      "loss": 0.6004,
      "step": 259
    },
    {
      "epoch": 0.9035621198957429,
      "grad_norm": 1.0171351432800293,
      "learning_rate": 4.6188702244292614e-07,
      "loss": 0.5684,
      "step": 260
    },
    {
      "epoch": 0.9070373588184187,
      "grad_norm": 0.9322757720947266,
      "learning_rate": 4.285487684354772e-07,
      "loss": 0.5311,
      "step": 261
    },
    {
      "epoch": 0.9105125977410947,
      "grad_norm": 0.9555125832557678,
      "learning_rate": 3.96432829938086e-07,
      "loss": 0.5627,
      "step": 262
    },
    {
      "epoch": 0.9139878366637706,
      "grad_norm": 0.8531783223152161,
      "learning_rate": 3.6554330829429716e-07,
      "loss": 0.5249,
      "step": 263
    },
    {
      "epoch": 0.9174630755864466,
      "grad_norm": 0.9125425815582275,
      "learning_rate": 3.3588414822895097e-07,
      "loss": 0.5257,
      "step": 264
    },
    {
      "epoch": 0.9209383145091226,
      "grad_norm": 0.9245027899742126,
      "learning_rate": 3.0745913734441357e-07,
      "loss": 0.5328,
      "step": 265
    },
    {
      "epoch": 0.9244135534317984,
      "grad_norm": 0.9753005504608154,
      "learning_rate": 2.8027190563689745e-07,
      "loss": 0.5431,
      "step": 266
    },
    {
      "epoch": 0.9278887923544744,
      "grad_norm": 0.9659878611564636,
      "learning_rate": 2.5432592503288e-07,
      "loss": 0.5794,
      "step": 267
    },
    {
      "epoch": 0.9313640312771503,
      "grad_norm": 1.0079331398010254,
      "learning_rate": 2.2962450894573606e-07,
      "loss": 0.5434,
      "step": 268
    },
    {
      "epoch": 0.9348392701998263,
      "grad_norm": 0.9261694550514221,
      "learning_rate": 2.0617081185259512e-07,
      "loss": 0.5718,
      "step": 269
    },
    {
      "epoch": 0.9383145091225021,
      "grad_norm": 0.8658697009086609,
      "learning_rate": 1.8396782889150144e-07,
      "loss": 0.5553,
      "step": 270
    },
    {
      "epoch": 0.9417897480451781,
      "grad_norm": 0.9265699982643127,
      "learning_rate": 1.630183954789233e-07,
      "loss": 0.551,
      "step": 271
    },
    {
      "epoch": 0.945264986967854,
      "grad_norm": 0.9904604554176331,
      "learning_rate": 1.4332518694765708e-07,
      "loss": 0.5113,
      "step": 272
    },
    {
      "epoch": 0.94874022589053,
      "grad_norm": 0.9163973331451416,
      "learning_rate": 1.2489071820517394e-07,
      "loss": 0.587,
      "step": 273
    },
    {
      "epoch": 0.952215464813206,
      "grad_norm": 0.8994106650352478,
      "learning_rate": 1.0771734341246121e-07,
      "loss": 0.5273,
      "step": 274
    },
    {
      "epoch": 0.9556907037358818,
      "grad_norm": 0.9306617975234985,
      "learning_rate": 9.180725568338045e-08,
      "loss": 0.5598,
      "step": 275
    },
    {
      "epoch": 0.9591659426585578,
      "grad_norm": 0.9544433355331421,
      "learning_rate": 7.716248680459726e-08,
      "loss": 0.5608,
      "step": 276
    },
    {
      "epoch": 0.9626411815812337,
      "grad_norm": 0.9377245903015137,
      "learning_rate": 6.378490697611761e-08,
      "loss": 0.5477,
      "step": 277
    },
    {
      "epoch": 0.9661164205039097,
      "grad_norm": 0.8245200514793396,
      "learning_rate": 5.1676224572452246e-08,
      "loss": 0.5014,
      "step": 278
    },
    {
      "epoch": 0.9695916594265855,
      "grad_norm": 0.8969350457191467,
      "learning_rate": 4.083798592444899e-08,
      "loss": 0.5899,
      "step": 279
    },
    {
      "epoch": 0.9730668983492615,
      "grad_norm": 0.9808463454246521,
      "learning_rate": 3.127157512182288e-08,
      "loss": 0.5642,
      "step": 280
    },
    {
      "epoch": 0.9765421372719374,
      "grad_norm": 0.9421252608299255,
      "learning_rate": 2.2978213836400974e-08,
      "loss": 0.5448,
      "step": 281
    },
    {
      "epoch": 0.9800173761946134,
      "grad_norm": 0.9303473830223083,
      "learning_rate": 1.5958961166104847e-08,
      "loss": 0.5947,
      "step": 282
    },
    {
      "epoch": 0.9834926151172894,
      "grad_norm": 1.075130820274353,
      "learning_rate": 1.0214713499706596e-08,
      "loss": 0.5925,
      "step": 283
    },
    {
      "epoch": 0.9869678540399652,
      "grad_norm": 0.9997183680534363,
      "learning_rate": 5.7462044023515186e-09,
      "loss": 0.6126,
      "step": 284
    },
    {
      "epoch": 0.9904430929626412,
      "grad_norm": 0.8891317248344421,
      "learning_rate": 2.5540045218819256e-09,
      "loss": 0.5529,
      "step": 285
    },
    {
      "epoch": 0.9939183318853171,
      "grad_norm": 0.8776668310165405,
      "learning_rate": 6.385215159565583e-10,
      "loss": 0.5456,
      "step": 286
    },
    {
      "epoch": 0.9973935708079931,
      "grad_norm": 0.8966051340103149,
      "learning_rate": 0.0,
      "loss": 0.5303,
      "step": 287
    },
    {
      "epoch": 0.9973935708079931,
      "step": 287,
      "total_flos": 2.698045158024282e+18,
      "train_loss": 0.5919382667707649,
      "train_runtime": 4471.5794,
      "train_samples_per_second": 16.469,
      "train_steps_per_second": 0.064
    }
  ],
  "logging_steps": 1.0,
  "max_steps": 287,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 400,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.698045158024282e+18,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}