diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,18963 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9987519068090416, + "eval_steps": 500, + "global_step": 2703, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001109416169740674, + "grad_norm": 0.06963343173265457, + "learning_rate": 1.107011070110701e-06, + "loss": 0.5928, + "step": 1 + }, + { + "epoch": 0.002218832339481348, + "grad_norm": 0.08431016653776169, + "learning_rate": 2.214022140221402e-06, + "loss": 0.7915, + "step": 2 + }, + { + "epoch": 0.003328248509222022, + "grad_norm": 0.06818119436502457, + "learning_rate": 3.321033210332103e-06, + "loss": 0.6594, + "step": 3 + }, + { + "epoch": 0.004437664678962696, + "grad_norm": 0.08563707023859024, + "learning_rate": 4.428044280442804e-06, + "loss": 0.5879, + "step": 4 + }, + { + "epoch": 0.00554708084870337, + "grad_norm": 0.053242169320583344, + "learning_rate": 5.535055350553505e-06, + "loss": 0.6156, + "step": 5 + }, + { + "epoch": 0.006656497018444044, + "grad_norm": 0.06964253634214401, + "learning_rate": 6.642066420664206e-06, + "loss": 0.7194, + "step": 6 + }, + { + "epoch": 0.007765913188184718, + "grad_norm": 0.07358719408512115, + "learning_rate": 7.749077490774907e-06, + "loss": 0.6253, + "step": 7 + }, + { + "epoch": 0.008875329357925392, + "grad_norm": 0.04413611814379692, + "learning_rate": 8.856088560885607e-06, + "loss": 0.5087, + "step": 8 + }, + { + "epoch": 0.009984745527666065, + "grad_norm": 0.06562013179063797, + "learning_rate": 9.96309963099631e-06, + "loss": 0.7106, + "step": 9 + }, + { + "epoch": 0.01109416169740674, + "grad_norm": 0.06969469785690308, + "learning_rate": 1.107011070110701e-05, + "loss": 0.5231, + "step": 10 + }, + { + "epoch": 0.012203577867147413, + "grad_norm": 0.06136368215084076, + "learning_rate": 1.2177121771217711e-05, + "loss": 0.5043, + "step": 11 + }, + { + "epoch": 0.013312994036888088, + "grad_norm": 0.22236867249011993, + "learning_rate": 1.3284132841328412e-05, + "loss": 1.1332, + "step": 12 + }, + { + "epoch": 0.014422410206628761, + "grad_norm": 0.05163656920194626, + "learning_rate": 1.4391143911439114e-05, + "loss": 0.6196, + "step": 13 + }, + { + "epoch": 0.015531826376369436, + "grad_norm": 0.07425156235694885, + "learning_rate": 1.5498154981549814e-05, + "loss": 0.6824, + "step": 14 + }, + { + "epoch": 0.01664124254611011, + "grad_norm": 0.10796231776475906, + "learning_rate": 1.6605166051660514e-05, + "loss": 0.6668, + "step": 15 + }, + { + "epoch": 0.017750658715850784, + "grad_norm": 0.08407709002494812, + "learning_rate": 1.7712177121771215e-05, + "loss": 0.6919, + "step": 16 + }, + { + "epoch": 0.01886007488559146, + "grad_norm": 0.09712005406618118, + "learning_rate": 1.8819188191881916e-05, + "loss": 0.6337, + "step": 17 + }, + { + "epoch": 0.01996949105533213, + "grad_norm": 0.09893519431352615, + "learning_rate": 1.992619926199262e-05, + "loss": 0.5787, + "step": 18 + }, + { + "epoch": 0.021078907225072805, + "grad_norm": 0.06742904335260391, + "learning_rate": 2.1033210332103317e-05, + "loss": 0.5236, + "step": 19 + }, + { + "epoch": 0.02218832339481348, + "grad_norm": 0.09684479981660843, + "learning_rate": 2.214022140221402e-05, + "loss": 0.8853, + "step": 20 + }, + { + "epoch": 0.023297739564554155, + "grad_norm": 0.07542835175991058, + "learning_rate": 2.3247232472324722e-05, + "loss": 0.5122, + "step": 21 + }, + { + "epoch": 0.024407155734294826, + "grad_norm": 0.10576613247394562, + "learning_rate": 2.4354243542435423e-05, + "loss": 0.5764, + "step": 22 + }, + { + "epoch": 0.0255165719040355, + "grad_norm": 0.09776762127876282, + "learning_rate": 2.5461254612546123e-05, + "loss": 0.5908, + "step": 23 + }, + { + "epoch": 0.026625988073776176, + "grad_norm": 0.11171472072601318, + "learning_rate": 2.6568265682656824e-05, + "loss": 0.6293, + "step": 24 + }, + { + "epoch": 0.02773540424351685, + "grad_norm": 0.07923085242509842, + "learning_rate": 2.7675276752767525e-05, + "loss": 0.5343, + "step": 25 + }, + { + "epoch": 0.028844820413257522, + "grad_norm": 0.13260161876678467, + "learning_rate": 2.878228782287823e-05, + "loss": 0.6485, + "step": 26 + }, + { + "epoch": 0.029954236582998197, + "grad_norm": 0.08974039554595947, + "learning_rate": 2.9889298892988926e-05, + "loss": 0.5814, + "step": 27 + }, + { + "epoch": 0.031063652752738872, + "grad_norm": 0.08319084346294403, + "learning_rate": 3.099630996309963e-05, + "loss": 0.5042, + "step": 28 + }, + { + "epoch": 0.03217306892247954, + "grad_norm": 0.21256183087825775, + "learning_rate": 3.2103321033210324e-05, + "loss": 0.7798, + "step": 29 + }, + { + "epoch": 0.03328248509222022, + "grad_norm": 0.15112732350826263, + "learning_rate": 3.321033210332103e-05, + "loss": 0.6845, + "step": 30 + }, + { + "epoch": 0.03439190126196089, + "grad_norm": 0.09808047860860825, + "learning_rate": 3.431734317343173e-05, + "loss": 0.631, + "step": 31 + }, + { + "epoch": 0.03550131743170157, + "grad_norm": 0.12207363545894623, + "learning_rate": 3.542435424354243e-05, + "loss": 0.6708, + "step": 32 + }, + { + "epoch": 0.03661073360144224, + "grad_norm": 0.11131646484136581, + "learning_rate": 3.6531365313653134e-05, + "loss": 0.4812, + "step": 33 + }, + { + "epoch": 0.03772014977118292, + "grad_norm": 0.15473362803459167, + "learning_rate": 3.763837638376383e-05, + "loss": 0.5202, + "step": 34 + }, + { + "epoch": 0.03882956594092359, + "grad_norm": 0.1214248314499855, + "learning_rate": 3.8745387453874535e-05, + "loss": 0.5629, + "step": 35 + }, + { + "epoch": 0.03993898211066426, + "grad_norm": 0.12858295440673828, + "learning_rate": 3.985239852398524e-05, + "loss": 0.6535, + "step": 36 + }, + { + "epoch": 0.041048398280404935, + "grad_norm": 0.23708263039588928, + "learning_rate": 4.0959409594095944e-05, + "loss": 0.7139, + "step": 37 + }, + { + "epoch": 0.04215781445014561, + "grad_norm": 0.11044642329216003, + "learning_rate": 4.2066420664206634e-05, + "loss": 0.5751, + "step": 38 + }, + { + "epoch": 0.043267230619886285, + "grad_norm": 0.23254919052124023, + "learning_rate": 4.317343173431734e-05, + "loss": 0.4629, + "step": 39 + }, + { + "epoch": 0.04437664678962696, + "grad_norm": 0.16974490880966187, + "learning_rate": 4.428044280442804e-05, + "loss": 0.7469, + "step": 40 + }, + { + "epoch": 0.045486062959367635, + "grad_norm": 0.1873021423816681, + "learning_rate": 4.538745387453874e-05, + "loss": 0.6589, + "step": 41 + }, + { + "epoch": 0.04659547912910831, + "grad_norm": 0.17700991034507751, + "learning_rate": 4.6494464944649444e-05, + "loss": 0.7062, + "step": 42 + }, + { + "epoch": 0.04770489529884898, + "grad_norm": 0.20159931480884552, + "learning_rate": 4.760147601476014e-05, + "loss": 0.6955, + "step": 43 + }, + { + "epoch": 0.04881431146858965, + "grad_norm": 0.18304705619812012, + "learning_rate": 4.8708487084870845e-05, + "loss": 0.6543, + "step": 44 + }, + { + "epoch": 0.04992372763833033, + "grad_norm": 0.1452740579843521, + "learning_rate": 4.981549815498154e-05, + "loss": 0.5875, + "step": 45 + }, + { + "epoch": 0.051033143808071, + "grad_norm": 0.16509367525577545, + "learning_rate": 5.092250922509225e-05, + "loss": 0.4751, + "step": 46 + }, + { + "epoch": 0.05214255997781168, + "grad_norm": 0.19247905910015106, + "learning_rate": 5.202952029520295e-05, + "loss": 0.4989, + "step": 47 + }, + { + "epoch": 0.05325197614755235, + "grad_norm": 0.20170418918132782, + "learning_rate": 5.313653136531365e-05, + "loss": 0.6385, + "step": 48 + }, + { + "epoch": 0.05436139231729303, + "grad_norm": 0.21076518297195435, + "learning_rate": 5.4243542435424346e-05, + "loss": 0.4535, + "step": 49 + }, + { + "epoch": 0.0554708084870337, + "grad_norm": 0.16010227799415588, + "learning_rate": 5.535055350553505e-05, + "loss": 0.5345, + "step": 50 + }, + { + "epoch": 0.05658022465677437, + "grad_norm": 0.21507202088832855, + "learning_rate": 5.6457564575645754e-05, + "loss": 0.5626, + "step": 51 + }, + { + "epoch": 0.057689640826515044, + "grad_norm": 0.2764558494091034, + "learning_rate": 5.756457564575646e-05, + "loss": 0.7726, + "step": 52 + }, + { + "epoch": 0.05879905699625572, + "grad_norm": 0.19346186518669128, + "learning_rate": 5.867158671586715e-05, + "loss": 0.6975, + "step": 53 + }, + { + "epoch": 0.059908473165996394, + "grad_norm": 0.15796959400177002, + "learning_rate": 5.977859778597785e-05, + "loss": 0.5838, + "step": 54 + }, + { + "epoch": 0.06101788933573707, + "grad_norm": 0.26421332359313965, + "learning_rate": 6.088560885608856e-05, + "loss": 0.4547, + "step": 55 + }, + { + "epoch": 0.062127305505477744, + "grad_norm": 0.15716983377933502, + "learning_rate": 6.199261992619925e-05, + "loss": 0.5811, + "step": 56 + }, + { + "epoch": 0.06323672167521842, + "grad_norm": 0.31450599431991577, + "learning_rate": 6.309963099630996e-05, + "loss": 0.6177, + "step": 57 + }, + { + "epoch": 0.06434613784495909, + "grad_norm": 0.1722506731748581, + "learning_rate": 6.420664206642065e-05, + "loss": 0.4451, + "step": 58 + }, + { + "epoch": 0.06545555401469977, + "grad_norm": 0.19947190582752228, + "learning_rate": 6.531365313653135e-05, + "loss": 0.6172, + "step": 59 + }, + { + "epoch": 0.06656497018444044, + "grad_norm": 0.24372801184654236, + "learning_rate": 6.642066420664206e-05, + "loss": 0.7055, + "step": 60 + }, + { + "epoch": 0.06767438635418112, + "grad_norm": 0.2935488224029541, + "learning_rate": 6.752767527675276e-05, + "loss": 0.6359, + "step": 61 + }, + { + "epoch": 0.06878380252392179, + "grad_norm": 0.30399176478385925, + "learning_rate": 6.863468634686347e-05, + "loss": 0.6141, + "step": 62 + }, + { + "epoch": 0.06989321869366245, + "grad_norm": 0.2780139446258545, + "learning_rate": 6.974169741697416e-05, + "loss": 0.6173, + "step": 63 + }, + { + "epoch": 0.07100263486340314, + "grad_norm": 0.22119928896427155, + "learning_rate": 7.084870848708486e-05, + "loss": 0.6311, + "step": 64 + }, + { + "epoch": 0.0721120510331438, + "grad_norm": 0.17710696160793304, + "learning_rate": 7.195571955719556e-05, + "loss": 0.5011, + "step": 65 + }, + { + "epoch": 0.07322146720288449, + "grad_norm": 0.24316059052944183, + "learning_rate": 7.306273062730627e-05, + "loss": 0.5671, + "step": 66 + }, + { + "epoch": 0.07433088337262515, + "grad_norm": 0.2396726906299591, + "learning_rate": 7.416974169741697e-05, + "loss": 0.5996, + "step": 67 + }, + { + "epoch": 0.07544029954236584, + "grad_norm": 0.25187021493911743, + "learning_rate": 7.527675276752766e-05, + "loss": 0.4867, + "step": 68 + }, + { + "epoch": 0.0765497157121065, + "grad_norm": 0.18911105394363403, + "learning_rate": 7.638376383763837e-05, + "loss": 0.5302, + "step": 69 + }, + { + "epoch": 0.07765913188184718, + "grad_norm": 0.2970142364501953, + "learning_rate": 7.749077490774907e-05, + "loss": 0.6817, + "step": 70 + }, + { + "epoch": 0.07876854805158785, + "grad_norm": 0.255367249250412, + "learning_rate": 7.859778597785978e-05, + "loss": 0.53, + "step": 71 + }, + { + "epoch": 0.07987796422132852, + "grad_norm": 0.2670186758041382, + "learning_rate": 7.970479704797048e-05, + "loss": 0.6765, + "step": 72 + }, + { + "epoch": 0.0809873803910692, + "grad_norm": 0.29176634550094604, + "learning_rate": 8.081180811808118e-05, + "loss": 0.6788, + "step": 73 + }, + { + "epoch": 0.08209679656080987, + "grad_norm": 0.1941952258348465, + "learning_rate": 8.191881918819189e-05, + "loss": 0.5154, + "step": 74 + }, + { + "epoch": 0.08320621273055055, + "grad_norm": 0.2700209617614746, + "learning_rate": 8.302583025830258e-05, + "loss": 0.6123, + "step": 75 + }, + { + "epoch": 0.08431562890029122, + "grad_norm": 0.2537286579608917, + "learning_rate": 8.413284132841327e-05, + "loss": 0.6377, + "step": 76 + }, + { + "epoch": 0.0854250450700319, + "grad_norm": 0.3083483874797821, + "learning_rate": 8.523985239852397e-05, + "loss": 0.6263, + "step": 77 + }, + { + "epoch": 0.08653446123977257, + "grad_norm": 0.1838807612657547, + "learning_rate": 8.634686346863468e-05, + "loss": 0.4246, + "step": 78 + }, + { + "epoch": 0.08764387740951324, + "grad_norm": 0.19544388353824615, + "learning_rate": 8.745387453874538e-05, + "loss": 0.6333, + "step": 79 + }, + { + "epoch": 0.08875329357925392, + "grad_norm": 0.11637648195028305, + "learning_rate": 8.856088560885608e-05, + "loss": 0.3302, + "step": 80 + }, + { + "epoch": 0.08986270974899459, + "grad_norm": 0.22084759175777435, + "learning_rate": 8.966789667896679e-05, + "loss": 0.5961, + "step": 81 + }, + { + "epoch": 0.09097212591873527, + "grad_norm": 0.20843982696533203, + "learning_rate": 9.077490774907748e-05, + "loss": 0.4923, + "step": 82 + }, + { + "epoch": 0.09208154208847594, + "grad_norm": 0.18717604875564575, + "learning_rate": 9.188191881918818e-05, + "loss": 0.7945, + "step": 83 + }, + { + "epoch": 0.09319095825821662, + "grad_norm": 0.5555791854858398, + "learning_rate": 9.298892988929889e-05, + "loss": 0.7418, + "step": 84 + }, + { + "epoch": 0.09430037442795729, + "grad_norm": 0.35459521412849426, + "learning_rate": 9.409594095940959e-05, + "loss": 0.4774, + "step": 85 + }, + { + "epoch": 0.09540979059769795, + "grad_norm": 0.33607375621795654, + "learning_rate": 9.520295202952028e-05, + "loss": 0.6068, + "step": 86 + }, + { + "epoch": 0.09651920676743864, + "grad_norm": 0.2963770031929016, + "learning_rate": 9.630996309963099e-05, + "loss": 0.6738, + "step": 87 + }, + { + "epoch": 0.0976286229371793, + "grad_norm": 0.1896924376487732, + "learning_rate": 9.741697416974169e-05, + "loss": 0.4507, + "step": 88 + }, + { + "epoch": 0.09873803910691999, + "grad_norm": 0.17777574062347412, + "learning_rate": 9.852398523985238e-05, + "loss": 0.7229, + "step": 89 + }, + { + "epoch": 0.09984745527666065, + "grad_norm": 0.2619733512401581, + "learning_rate": 9.963099630996309e-05, + "loss": 0.6666, + "step": 90 + }, + { + "epoch": 0.10095687144640134, + "grad_norm": 0.24837014079093933, + "learning_rate": 0.00010073800738007379, + "loss": 0.6532, + "step": 91 + }, + { + "epoch": 0.102066287616142, + "grad_norm": 0.16982443630695343, + "learning_rate": 0.0001018450184501845, + "loss": 0.6299, + "step": 92 + }, + { + "epoch": 0.10317570378588269, + "grad_norm": 0.25121009349823, + "learning_rate": 0.0001029520295202952, + "loss": 0.5286, + "step": 93 + }, + { + "epoch": 0.10428511995562335, + "grad_norm": 0.17865754663944244, + "learning_rate": 0.0001040590405904059, + "loss": 0.6088, + "step": 94 + }, + { + "epoch": 0.10539453612536402, + "grad_norm": 0.21300899982452393, + "learning_rate": 0.0001051660516605166, + "loss": 0.5066, + "step": 95 + }, + { + "epoch": 0.1065039522951047, + "grad_norm": 0.18651628494262695, + "learning_rate": 0.0001062730627306273, + "loss": 0.5439, + "step": 96 + }, + { + "epoch": 0.10761336846484537, + "grad_norm": 0.18930092453956604, + "learning_rate": 0.00010738007380073799, + "loss": 0.4745, + "step": 97 + }, + { + "epoch": 0.10872278463458605, + "grad_norm": 0.1967380791902542, + "learning_rate": 0.00010848708487084869, + "loss": 0.5471, + "step": 98 + }, + { + "epoch": 0.10983220080432672, + "grad_norm": 0.16241402924060822, + "learning_rate": 0.0001095940959409594, + "loss": 0.6002, + "step": 99 + }, + { + "epoch": 0.1109416169740674, + "grad_norm": 0.22282204031944275, + "learning_rate": 0.0001107011070110701, + "loss": 0.4687, + "step": 100 + }, + { + "epoch": 0.11205103314380807, + "grad_norm": 0.1546042561531067, + "learning_rate": 0.0001118081180811808, + "loss": 0.599, + "step": 101 + }, + { + "epoch": 0.11316044931354874, + "grad_norm": 0.19606682658195496, + "learning_rate": 0.00011291512915129151, + "loss": 0.5543, + "step": 102 + }, + { + "epoch": 0.11426986548328942, + "grad_norm": 0.16278928518295288, + "learning_rate": 0.00011402214022140221, + "loss": 0.4827, + "step": 103 + }, + { + "epoch": 0.11537928165303009, + "grad_norm": 0.18518386781215668, + "learning_rate": 0.00011512915129151292, + "loss": 0.6719, + "step": 104 + }, + { + "epoch": 0.11648869782277077, + "grad_norm": 0.3365796208381653, + "learning_rate": 0.00011623616236162362, + "loss": 0.3853, + "step": 105 + }, + { + "epoch": 0.11759811399251144, + "grad_norm": 0.15232332050800323, + "learning_rate": 0.0001173431734317343, + "loss": 0.6667, + "step": 106 + }, + { + "epoch": 0.11870753016225212, + "grad_norm": 0.15742583572864532, + "learning_rate": 0.000118450184501845, + "loss": 0.6997, + "step": 107 + }, + { + "epoch": 0.11981694633199279, + "grad_norm": 0.18328164517879486, + "learning_rate": 0.0001195571955719557, + "loss": 0.4386, + "step": 108 + }, + { + "epoch": 0.12092636250173346, + "grad_norm": 0.19277305901050568, + "learning_rate": 0.00012066420664206641, + "loss": 0.4237, + "step": 109 + }, + { + "epoch": 0.12203577867147414, + "grad_norm": 0.20428280532360077, + "learning_rate": 0.00012177121771217711, + "loss": 0.8255, + "step": 110 + }, + { + "epoch": 0.1231451948412148, + "grad_norm": 0.21956562995910645, + "learning_rate": 0.0001228782287822878, + "loss": 0.5919, + "step": 111 + }, + { + "epoch": 0.12425461101095549, + "grad_norm": 0.16980376839637756, + "learning_rate": 0.0001239852398523985, + "loss": 0.3784, + "step": 112 + }, + { + "epoch": 0.12536402718069617, + "grad_norm": 0.14789903163909912, + "learning_rate": 0.0001250922509225092, + "loss": 0.4501, + "step": 113 + }, + { + "epoch": 0.12647344335043684, + "grad_norm": 0.22972984611988068, + "learning_rate": 0.00012619926199261992, + "loss": 0.5988, + "step": 114 + }, + { + "epoch": 0.1275828595201775, + "grad_norm": 0.345342218875885, + "learning_rate": 0.00012730627306273062, + "loss": 0.6277, + "step": 115 + }, + { + "epoch": 0.12869227568991817, + "grad_norm": 0.22726920247077942, + "learning_rate": 0.0001284132841328413, + "loss": 0.7774, + "step": 116 + }, + { + "epoch": 0.12980169185965884, + "grad_norm": 0.18863454461097717, + "learning_rate": 0.000129520295202952, + "loss": 0.8846, + "step": 117 + }, + { + "epoch": 0.13091110802939954, + "grad_norm": 0.14772231876850128, + "learning_rate": 0.0001306273062730627, + "loss": 0.546, + "step": 118 + }, + { + "epoch": 0.1320205241991402, + "grad_norm": 0.16707932949066162, + "learning_rate": 0.0001317343173431734, + "loss": 0.4935, + "step": 119 + }, + { + "epoch": 0.13312994036888087, + "grad_norm": 0.17367342114448547, + "learning_rate": 0.00013284132841328411, + "loss": 0.5853, + "step": 120 + }, + { + "epoch": 0.13423935653862154, + "grad_norm": 0.20932604372501373, + "learning_rate": 0.00013394833948339482, + "loss": 0.5861, + "step": 121 + }, + { + "epoch": 0.13534877270836224, + "grad_norm": 0.19261795282363892, + "learning_rate": 0.00013505535055350552, + "loss": 0.5912, + "step": 122 + }, + { + "epoch": 0.1364581888781029, + "grad_norm": 0.16762207448482513, + "learning_rate": 0.00013616236162361623, + "loss": 0.6856, + "step": 123 + }, + { + "epoch": 0.13756760504784357, + "grad_norm": 0.22033065557479858, + "learning_rate": 0.00013726937269372693, + "loss": 0.6217, + "step": 124 + }, + { + "epoch": 0.13867702121758424, + "grad_norm": 0.1888565719127655, + "learning_rate": 0.00013837638376383763, + "loss": 0.4837, + "step": 125 + }, + { + "epoch": 0.1397864373873249, + "grad_norm": 0.13172639906406403, + "learning_rate": 0.0001394833948339483, + "loss": 0.4298, + "step": 126 + }, + { + "epoch": 0.1408958535570656, + "grad_norm": 0.20497459173202515, + "learning_rate": 0.00014059040590405902, + "loss": 0.7032, + "step": 127 + }, + { + "epoch": 0.14200526972680627, + "grad_norm": 0.6339800357818604, + "learning_rate": 0.00014169741697416972, + "loss": 0.5669, + "step": 128 + }, + { + "epoch": 0.14311468589654694, + "grad_norm": 0.25784072279930115, + "learning_rate": 0.00014280442804428042, + "loss": 0.6115, + "step": 129 + }, + { + "epoch": 0.1442241020662876, + "grad_norm": 0.1341128647327423, + "learning_rate": 0.00014391143911439113, + "loss": 0.5487, + "step": 130 + }, + { + "epoch": 0.1453335182360283, + "grad_norm": 0.15084423124790192, + "learning_rate": 0.00014501845018450183, + "loss": 0.5858, + "step": 131 + }, + { + "epoch": 0.14644293440576897, + "grad_norm": 0.3109219968318939, + "learning_rate": 0.00014612546125461254, + "loss": 0.5663, + "step": 132 + }, + { + "epoch": 0.14755235057550964, + "grad_norm": 0.20024478435516357, + "learning_rate": 0.00014723247232472324, + "loss": 0.49, + "step": 133 + }, + { + "epoch": 0.1486617667452503, + "grad_norm": 0.2169143110513687, + "learning_rate": 0.00014833948339483394, + "loss": 0.5955, + "step": 134 + }, + { + "epoch": 0.14977118291499097, + "grad_norm": 0.12287949025630951, + "learning_rate": 0.00014944649446494465, + "loss": 0.5406, + "step": 135 + }, + { + "epoch": 0.15088059908473167, + "grad_norm": 0.1676974594593048, + "learning_rate": 0.00015055350553505533, + "loss": 0.5796, + "step": 136 + }, + { + "epoch": 0.15199001525447234, + "grad_norm": 0.18620753288269043, + "learning_rate": 0.00015166051660516606, + "loss": 0.4466, + "step": 137 + }, + { + "epoch": 0.153099431424213, + "grad_norm": 0.2430751472711563, + "learning_rate": 0.00015276752767527673, + "loss": 0.5272, + "step": 138 + }, + { + "epoch": 0.15420884759395367, + "grad_norm": 0.21565105020999908, + "learning_rate": 0.00015387453874538746, + "loss": 0.7445, + "step": 139 + }, + { + "epoch": 0.15531826376369437, + "grad_norm": 0.14998437464237213, + "learning_rate": 0.00015498154981549814, + "loss": 0.7069, + "step": 140 + }, + { + "epoch": 0.15642767993343504, + "grad_norm": 0.22991682589054108, + "learning_rate": 0.00015608856088560882, + "loss": 0.5109, + "step": 141 + }, + { + "epoch": 0.1575370961031757, + "grad_norm": 0.21704424917697906, + "learning_rate": 0.00015719557195571955, + "loss": 0.5226, + "step": 142 + }, + { + "epoch": 0.15864651227291637, + "grad_norm": 0.141441211104393, + "learning_rate": 0.00015830258302583023, + "loss": 0.6123, + "step": 143 + }, + { + "epoch": 0.15975592844265704, + "grad_norm": 0.14867329597473145, + "learning_rate": 0.00015940959409594096, + "loss": 0.606, + "step": 144 + }, + { + "epoch": 0.16086534461239774, + "grad_norm": 0.37929290533065796, + "learning_rate": 0.00016051660516605164, + "loss": 0.5971, + "step": 145 + }, + { + "epoch": 0.1619747607821384, + "grad_norm": 0.19486796855926514, + "learning_rate": 0.00016162361623616237, + "loss": 0.554, + "step": 146 + }, + { + "epoch": 0.16308417695187907, + "grad_norm": 0.16117985546588898, + "learning_rate": 0.00016273062730627304, + "loss": 0.5379, + "step": 147 + }, + { + "epoch": 0.16419359312161974, + "grad_norm": 0.21857589483261108, + "learning_rate": 0.00016383763837638377, + "loss": 0.6984, + "step": 148 + }, + { + "epoch": 0.1653030092913604, + "grad_norm": 0.18841134011745453, + "learning_rate": 0.00016494464944649445, + "loss": 0.7714, + "step": 149 + }, + { + "epoch": 0.1664124254611011, + "grad_norm": 0.16624371707439423, + "learning_rate": 0.00016605166051660516, + "loss": 0.5173, + "step": 150 + }, + { + "epoch": 0.16752184163084177, + "grad_norm": 0.17951254546642303, + "learning_rate": 0.00016715867158671586, + "loss": 0.8636, + "step": 151 + }, + { + "epoch": 0.16863125780058244, + "grad_norm": 0.1839190572500229, + "learning_rate": 0.00016826568265682654, + "loss": 0.4827, + "step": 152 + }, + { + "epoch": 0.1697406739703231, + "grad_norm": 0.12314256280660629, + "learning_rate": 0.00016937269372693727, + "loss": 0.512, + "step": 153 + }, + { + "epoch": 0.1708500901400638, + "grad_norm": 0.12132935225963593, + "learning_rate": 0.00017047970479704795, + "loss": 0.5284, + "step": 154 + }, + { + "epoch": 0.17195950630980447, + "grad_norm": 0.29458367824554443, + "learning_rate": 0.00017158671586715868, + "loss": 0.703, + "step": 155 + }, + { + "epoch": 0.17306892247954514, + "grad_norm": 0.15419234335422516, + "learning_rate": 0.00017269372693726935, + "loss": 0.6275, + "step": 156 + }, + { + "epoch": 0.1741783386492858, + "grad_norm": 0.12458593398332596, + "learning_rate": 0.00017380073800738006, + "loss": 0.4877, + "step": 157 + }, + { + "epoch": 0.17528775481902648, + "grad_norm": 0.1632385402917862, + "learning_rate": 0.00017490774907749076, + "loss": 0.5432, + "step": 158 + }, + { + "epoch": 0.17639717098876717, + "grad_norm": 0.12995319068431854, + "learning_rate": 0.00017601476014760147, + "loss": 0.5964, + "step": 159 + }, + { + "epoch": 0.17750658715850784, + "grad_norm": 0.09651859104633331, + "learning_rate": 0.00017712177121771217, + "loss": 0.5907, + "step": 160 + }, + { + "epoch": 0.1786160033282485, + "grad_norm": 0.20671844482421875, + "learning_rate": 0.00017822878228782285, + "loss": 0.574, + "step": 161 + }, + { + "epoch": 0.17972541949798918, + "grad_norm": 0.1272733509540558, + "learning_rate": 0.00017933579335793358, + "loss": 0.5413, + "step": 162 + }, + { + "epoch": 0.18083483566772987, + "grad_norm": 0.12072700262069702, + "learning_rate": 0.00018044280442804426, + "loss": 0.4555, + "step": 163 + }, + { + "epoch": 0.18194425183747054, + "grad_norm": 0.18685540556907654, + "learning_rate": 0.00018154981549815496, + "loss": 0.7267, + "step": 164 + }, + { + "epoch": 0.1830536680072112, + "grad_norm": 0.18687789142131805, + "learning_rate": 0.00018265682656826566, + "loss": 0.5785, + "step": 165 + }, + { + "epoch": 0.18416308417695187, + "grad_norm": 0.2580098807811737, + "learning_rate": 0.00018376383763837637, + "loss": 0.5662, + "step": 166 + }, + { + "epoch": 0.18527250034669254, + "grad_norm": 0.1476382166147232, + "learning_rate": 0.00018487084870848707, + "loss": 0.5427, + "step": 167 + }, + { + "epoch": 0.18638191651643324, + "grad_norm": 0.14074724912643433, + "learning_rate": 0.00018597785977859778, + "loss": 0.4759, + "step": 168 + }, + { + "epoch": 0.1874913326861739, + "grad_norm": 0.2741679847240448, + "learning_rate": 0.00018708487084870848, + "loss": 0.6973, + "step": 169 + }, + { + "epoch": 0.18860074885591457, + "grad_norm": 0.2278490662574768, + "learning_rate": 0.00018819188191881918, + "loss": 0.586, + "step": 170 + }, + { + "epoch": 0.18971016502565524, + "grad_norm": 0.1516968458890915, + "learning_rate": 0.00018929889298892986, + "loss": 0.378, + "step": 171 + }, + { + "epoch": 0.1908195811953959, + "grad_norm": 0.17562097311019897, + "learning_rate": 0.00019040590405904056, + "loss": 0.6113, + "step": 172 + }, + { + "epoch": 0.1919289973651366, + "grad_norm": 0.14954030513763428, + "learning_rate": 0.00019151291512915127, + "loss": 0.4926, + "step": 173 + }, + { + "epoch": 0.19303841353487727, + "grad_norm": 0.1906110644340515, + "learning_rate": 0.00019261992619926197, + "loss": 0.4884, + "step": 174 + }, + { + "epoch": 0.19414782970461794, + "grad_norm": 0.18351547420024872, + "learning_rate": 0.00019372693726937268, + "loss": 0.5295, + "step": 175 + }, + { + "epoch": 0.1952572458743586, + "grad_norm": 0.2049722820520401, + "learning_rate": 0.00019483394833948338, + "loss": 0.4562, + "step": 176 + }, + { + "epoch": 0.1963666620440993, + "grad_norm": 0.21823738515377045, + "learning_rate": 0.00019594095940959409, + "loss": 0.5329, + "step": 177 + }, + { + "epoch": 0.19747607821383997, + "grad_norm": 0.1077185720205307, + "learning_rate": 0.00019704797047970476, + "loss": 0.4787, + "step": 178 + }, + { + "epoch": 0.19858549438358064, + "grad_norm": 0.19151321053504944, + "learning_rate": 0.0001981549815498155, + "loss": 0.7764, + "step": 179 + }, + { + "epoch": 0.1996949105533213, + "grad_norm": 0.16796617209911346, + "learning_rate": 0.00019926199261992617, + "loss": 0.4962, + "step": 180 + }, + { + "epoch": 0.20080432672306198, + "grad_norm": 0.31866180896759033, + "learning_rate": 0.00020036900369003687, + "loss": 0.8462, + "step": 181 + }, + { + "epoch": 0.20191374289280267, + "grad_norm": 0.2093004584312439, + "learning_rate": 0.00020147601476014758, + "loss": 0.5916, + "step": 182 + }, + { + "epoch": 0.20302315906254334, + "grad_norm": 0.2198261171579361, + "learning_rate": 0.00020258302583025828, + "loss": 0.434, + "step": 183 + }, + { + "epoch": 0.204132575232284, + "grad_norm": 0.19753794372081757, + "learning_rate": 0.000203690036900369, + "loss": 0.6142, + "step": 184 + }, + { + "epoch": 0.20524199140202468, + "grad_norm": 0.1537386178970337, + "learning_rate": 0.00020479704797047966, + "loss": 0.4053, + "step": 185 + }, + { + "epoch": 0.20635140757176537, + "grad_norm": 0.194119393825531, + "learning_rate": 0.0002059040590405904, + "loss": 0.6506, + "step": 186 + }, + { + "epoch": 0.20746082374150604, + "grad_norm": 0.16899624466896057, + "learning_rate": 0.00020701107011070107, + "loss": 0.572, + "step": 187 + }, + { + "epoch": 0.2085702399112467, + "grad_norm": 0.14211402833461761, + "learning_rate": 0.0002081180811808118, + "loss": 0.5999, + "step": 188 + }, + { + "epoch": 0.20967965608098738, + "grad_norm": 0.17124126851558685, + "learning_rate": 0.00020922509225092248, + "loss": 0.4797, + "step": 189 + }, + { + "epoch": 0.21078907225072804, + "grad_norm": 0.1637413501739502, + "learning_rate": 0.0002103321033210332, + "loss": 0.3899, + "step": 190 + }, + { + "epoch": 0.21189848842046874, + "grad_norm": 0.2928311228752136, + "learning_rate": 0.0002114391143911439, + "loss": 0.4297, + "step": 191 + }, + { + "epoch": 0.2130079045902094, + "grad_norm": 0.19800159335136414, + "learning_rate": 0.0002125461254612546, + "loss": 0.6641, + "step": 192 + }, + { + "epoch": 0.21411732075995007, + "grad_norm": 0.1297946572303772, + "learning_rate": 0.0002136531365313653, + "loss": 0.4557, + "step": 193 + }, + { + "epoch": 0.21522673692969074, + "grad_norm": 0.24120758473873138, + "learning_rate": 0.00021476014760147597, + "loss": 0.7058, + "step": 194 + }, + { + "epoch": 0.2163361530994314, + "grad_norm": 0.15839487314224243, + "learning_rate": 0.0002158671586715867, + "loss": 0.5106, + "step": 195 + }, + { + "epoch": 0.2174455692691721, + "grad_norm": 0.18157745897769928, + "learning_rate": 0.00021697416974169738, + "loss": 0.8263, + "step": 196 + }, + { + "epoch": 0.21855498543891277, + "grad_norm": 0.2212788164615631, + "learning_rate": 0.0002180811808118081, + "loss": 0.4721, + "step": 197 + }, + { + "epoch": 0.21966440160865344, + "grad_norm": 0.2079571783542633, + "learning_rate": 0.0002191881918819188, + "loss": 0.5491, + "step": 198 + }, + { + "epoch": 0.2207738177783941, + "grad_norm": 0.29977646470069885, + "learning_rate": 0.00022029520295202952, + "loss": 0.5165, + "step": 199 + }, + { + "epoch": 0.2218832339481348, + "grad_norm": 0.15032988786697388, + "learning_rate": 0.0002214022140221402, + "loss": 0.5833, + "step": 200 + }, + { + "epoch": 0.22299265011787547, + "grad_norm": 0.1367197334766388, + "learning_rate": 0.00022250922509225088, + "loss": 0.4895, + "step": 201 + }, + { + "epoch": 0.22410206628761614, + "grad_norm": 0.20162621140480042, + "learning_rate": 0.0002236162361623616, + "loss": 0.6069, + "step": 202 + }, + { + "epoch": 0.2252114824573568, + "grad_norm": 0.1476079821586609, + "learning_rate": 0.00022472324723247228, + "loss": 0.3195, + "step": 203 + }, + { + "epoch": 0.22632089862709748, + "grad_norm": 0.16391322016716003, + "learning_rate": 0.00022583025830258302, + "loss": 0.5242, + "step": 204 + }, + { + "epoch": 0.22743031479683817, + "grad_norm": 0.32637152075767517, + "learning_rate": 0.0002269372693726937, + "loss": 0.608, + "step": 205 + }, + { + "epoch": 0.22853973096657884, + "grad_norm": 0.18414971232414246, + "learning_rate": 0.00022804428044280442, + "loss": 0.5414, + "step": 206 + }, + { + "epoch": 0.2296491471363195, + "grad_norm": 0.13513615727424622, + "learning_rate": 0.0002291512915129151, + "loss": 0.452, + "step": 207 + }, + { + "epoch": 0.23075856330606018, + "grad_norm": 0.1715661883354187, + "learning_rate": 0.00023025830258302583, + "loss": 0.5552, + "step": 208 + }, + { + "epoch": 0.23186797947580087, + "grad_norm": 0.2078085094690323, + "learning_rate": 0.0002313653136531365, + "loss": 0.6519, + "step": 209 + }, + { + "epoch": 0.23297739564554154, + "grad_norm": 0.45334669947624207, + "learning_rate": 0.00023247232472324724, + "loss": 0.9767, + "step": 210 + }, + { + "epoch": 0.2340868118152822, + "grad_norm": 0.17819558084011078, + "learning_rate": 0.00023357933579335792, + "loss": 0.7568, + "step": 211 + }, + { + "epoch": 0.23519622798502288, + "grad_norm": 0.4460853338241577, + "learning_rate": 0.0002346863468634686, + "loss": 0.4265, + "step": 212 + }, + { + "epoch": 0.23630564415476354, + "grad_norm": 0.2014990597963333, + "learning_rate": 0.00023579335793357933, + "loss": 0.6479, + "step": 213 + }, + { + "epoch": 0.23741506032450424, + "grad_norm": 0.17922167479991913, + "learning_rate": 0.00023690036900369, + "loss": 0.6739, + "step": 214 + }, + { + "epoch": 0.2385244764942449, + "grad_norm": 0.14204619824886322, + "learning_rate": 0.00023800738007380073, + "loss": 0.4641, + "step": 215 + }, + { + "epoch": 0.23963389266398558, + "grad_norm": 0.17847496271133423, + "learning_rate": 0.0002391143911439114, + "loss": 0.6229, + "step": 216 + }, + { + "epoch": 0.24074330883372624, + "grad_norm": 0.20707765221595764, + "learning_rate": 0.00024022140221402214, + "loss": 0.6337, + "step": 217 + }, + { + "epoch": 0.2418527250034669, + "grad_norm": 0.1623317301273346, + "learning_rate": 0.00024132841328413282, + "loss": 0.5256, + "step": 218 + }, + { + "epoch": 0.2429621411732076, + "grad_norm": 0.18768182396888733, + "learning_rate": 0.00024243542435424352, + "loss": 0.5793, + "step": 219 + }, + { + "epoch": 0.24407155734294828, + "grad_norm": 0.16495372354984283, + "learning_rate": 0.00024354243542435423, + "loss": 0.4539, + "step": 220 + }, + { + "epoch": 0.24518097351268894, + "grad_norm": 0.14188359677791595, + "learning_rate": 0.00024464944649446493, + "loss": 0.5089, + "step": 221 + }, + { + "epoch": 0.2462903896824296, + "grad_norm": 0.18091897666454315, + "learning_rate": 0.0002457564575645756, + "loss": 0.5689, + "step": 222 + }, + { + "epoch": 0.2473998058521703, + "grad_norm": 0.214166522026062, + "learning_rate": 0.00024686346863468634, + "loss": 0.6479, + "step": 223 + }, + { + "epoch": 0.24850922202191097, + "grad_norm": 0.19823165237903595, + "learning_rate": 0.000247970479704797, + "loss": 0.4772, + "step": 224 + }, + { + "epoch": 0.24961863819165164, + "grad_norm": 0.20246858894824982, + "learning_rate": 0.0002490774907749077, + "loss": 0.6838, + "step": 225 + }, + { + "epoch": 0.25072805436139234, + "grad_norm": 0.17839235067367554, + "learning_rate": 0.0002501845018450184, + "loss": 0.6234, + "step": 226 + }, + { + "epoch": 0.251837470531133, + "grad_norm": 0.15513737499713898, + "learning_rate": 0.0002512915129151291, + "loss": 0.6836, + "step": 227 + }, + { + "epoch": 0.2529468867008737, + "grad_norm": 0.3762029707431793, + "learning_rate": 0.00025239852398523983, + "loss": 0.6588, + "step": 228 + }, + { + "epoch": 0.2540563028706143, + "grad_norm": 0.24218355119228363, + "learning_rate": 0.0002535055350553505, + "loss": 0.663, + "step": 229 + }, + { + "epoch": 0.255165719040355, + "grad_norm": 0.23746894299983978, + "learning_rate": 0.00025461254612546124, + "loss": 0.8055, + "step": 230 + }, + { + "epoch": 0.2562751352100957, + "grad_norm": 0.19097627699375153, + "learning_rate": 0.0002557195571955719, + "loss": 0.5028, + "step": 231 + }, + { + "epoch": 0.25738455137983635, + "grad_norm": 0.16248895227909088, + "learning_rate": 0.0002568265682656826, + "loss": 0.4644, + "step": 232 + }, + { + "epoch": 0.25849396754957704, + "grad_norm": 0.15660254657268524, + "learning_rate": 0.0002579335793357933, + "loss": 0.4822, + "step": 233 + }, + { + "epoch": 0.2596033837193177, + "grad_norm": 0.18094629049301147, + "learning_rate": 0.000259040590405904, + "loss": 0.6761, + "step": 234 + }, + { + "epoch": 0.2607127998890584, + "grad_norm": 0.20552562177181244, + "learning_rate": 0.00026014760147601473, + "loss": 0.654, + "step": 235 + }, + { + "epoch": 0.2618222160587991, + "grad_norm": 0.1620112657546997, + "learning_rate": 0.0002612546125461254, + "loss": 0.3786, + "step": 236 + }, + { + "epoch": 0.2629316322285397, + "grad_norm": 0.25460562109947205, + "learning_rate": 0.00026236162361623614, + "loss": 0.6258, + "step": 237 + }, + { + "epoch": 0.2640410483982804, + "grad_norm": 0.17765319347381592, + "learning_rate": 0.0002634686346863468, + "loss": 0.3955, + "step": 238 + }, + { + "epoch": 0.2651504645680211, + "grad_norm": 0.2620762884616852, + "learning_rate": 0.00026457564575645755, + "loss": 0.5884, + "step": 239 + }, + { + "epoch": 0.26625988073776174, + "grad_norm": 0.14502941071987152, + "learning_rate": 0.00026568265682656823, + "loss": 0.5391, + "step": 240 + }, + { + "epoch": 0.26736929690750244, + "grad_norm": 0.15013466775417328, + "learning_rate": 0.0002667896678966789, + "loss": 0.5676, + "step": 241 + }, + { + "epoch": 0.2684787130772431, + "grad_norm": 0.2311590611934662, + "learning_rate": 0.00026789667896678964, + "loss": 0.6409, + "step": 242 + }, + { + "epoch": 0.2695881292469838, + "grad_norm": 0.16972240805625916, + "learning_rate": 0.0002690036900369003, + "loss": 0.488, + "step": 243 + }, + { + "epoch": 0.27069754541672447, + "grad_norm": 0.13695970177650452, + "learning_rate": 0.00027011070110701104, + "loss": 0.5997, + "step": 244 + }, + { + "epoch": 0.2718069615864651, + "grad_norm": 0.16296394169330597, + "learning_rate": 0.0002712177121771217, + "loss": 0.5645, + "step": 245 + }, + { + "epoch": 0.2729163777562058, + "grad_norm": 0.15607796609401703, + "learning_rate": 0.00027232472324723245, + "loss": 0.3976, + "step": 246 + }, + { + "epoch": 0.27402579392594645, + "grad_norm": 0.17693723738193512, + "learning_rate": 0.00027343173431734313, + "loss": 0.6873, + "step": 247 + }, + { + "epoch": 0.27513521009568714, + "grad_norm": 0.16682426631450653, + "learning_rate": 0.00027453874538745386, + "loss": 0.5259, + "step": 248 + }, + { + "epoch": 0.27624462626542784, + "grad_norm": 0.16855113208293915, + "learning_rate": 0.00027564575645756454, + "loss": 0.5889, + "step": 249 + }, + { + "epoch": 0.2773540424351685, + "grad_norm": 0.160100057721138, + "learning_rate": 0.00027675276752767527, + "loss": 0.5832, + "step": 250 + }, + { + "epoch": 0.2784634586049092, + "grad_norm": 0.24193750321865082, + "learning_rate": 0.00027785977859778595, + "loss": 0.4325, + "step": 251 + }, + { + "epoch": 0.2795728747746498, + "grad_norm": 0.12794721126556396, + "learning_rate": 0.0002789667896678966, + "loss": 0.7371, + "step": 252 + }, + { + "epoch": 0.2806822909443905, + "grad_norm": 0.22222159802913666, + "learning_rate": 0.00028007380073800735, + "loss": 0.6177, + "step": 253 + }, + { + "epoch": 0.2817917071141312, + "grad_norm": 0.24400316178798676, + "learning_rate": 0.00028118081180811803, + "loss": 0.5169, + "step": 254 + }, + { + "epoch": 0.28290112328387185, + "grad_norm": 0.23530688881874084, + "learning_rate": 0.00028228782287822876, + "loss": 0.4384, + "step": 255 + }, + { + "epoch": 0.28401053945361254, + "grad_norm": 0.20001864433288574, + "learning_rate": 0.00028339483394833944, + "loss": 0.4855, + "step": 256 + }, + { + "epoch": 0.2851199556233532, + "grad_norm": 0.1441182792186737, + "learning_rate": 0.00028450184501845017, + "loss": 0.5467, + "step": 257 + }, + { + "epoch": 0.2862293717930939, + "grad_norm": 0.17499203979969025, + "learning_rate": 0.00028560885608856085, + "loss": 0.6093, + "step": 258 + }, + { + "epoch": 0.2873387879628346, + "grad_norm": 0.13634347915649414, + "learning_rate": 0.0002867158671586716, + "loss": 0.5183, + "step": 259 + }, + { + "epoch": 0.2884482041325752, + "grad_norm": 0.1825854778289795, + "learning_rate": 0.00028782287822878226, + "loss": 0.524, + "step": 260 + }, + { + "epoch": 0.2895576203023159, + "grad_norm": 0.1797315776348114, + "learning_rate": 0.00028892988929889293, + "loss": 0.9663, + "step": 261 + }, + { + "epoch": 0.2906670364720566, + "grad_norm": 0.22019031643867493, + "learning_rate": 0.00029003690036900366, + "loss": 0.8477, + "step": 262 + }, + { + "epoch": 0.29177645264179725, + "grad_norm": 0.18698155879974365, + "learning_rate": 0.00029114391143911434, + "loss": 0.5928, + "step": 263 + }, + { + "epoch": 0.29288586881153794, + "grad_norm": 0.18208318948745728, + "learning_rate": 0.00029225092250922507, + "loss": 0.5586, + "step": 264 + }, + { + "epoch": 0.2939952849812786, + "grad_norm": 0.17802134156227112, + "learning_rate": 0.00029335793357933575, + "loss": 0.499, + "step": 265 + }, + { + "epoch": 0.2951047011510193, + "grad_norm": 0.25552839040756226, + "learning_rate": 0.0002944649446494465, + "loss": 0.6052, + "step": 266 + }, + { + "epoch": 0.29621411732076, + "grad_norm": 0.15605288743972778, + "learning_rate": 0.00029557195571955716, + "loss": 0.4767, + "step": 267 + }, + { + "epoch": 0.2973235334905006, + "grad_norm": 0.18597030639648438, + "learning_rate": 0.0002966789667896679, + "loss": 0.6442, + "step": 268 + }, + { + "epoch": 0.2984329496602413, + "grad_norm": 0.2584044933319092, + "learning_rate": 0.00029778597785977857, + "loss": 0.6259, + "step": 269 + }, + { + "epoch": 0.29954236582998195, + "grad_norm": 0.13973256945610046, + "learning_rate": 0.0002988929889298893, + "loss": 0.6127, + "step": 270 + }, + { + "epoch": 0.30065178199972264, + "grad_norm": 0.21028122305870056, + "learning_rate": 0.0003, + "loss": 0.3969, + "step": 271 + }, + { + "epoch": 0.30176119816946334, + "grad_norm": 0.232606902718544, + "learning_rate": 0.00029987664473684207, + "loss": 0.6801, + "step": 272 + }, + { + "epoch": 0.302870614339204, + "grad_norm": 0.24834930896759033, + "learning_rate": 0.00029975328947368416, + "loss": 0.5395, + "step": 273 + }, + { + "epoch": 0.3039800305089447, + "grad_norm": 0.2343815118074417, + "learning_rate": 0.00029962993421052625, + "loss": 0.6029, + "step": 274 + }, + { + "epoch": 0.3050894466786853, + "grad_norm": 0.23860520124435425, + "learning_rate": 0.0002995065789473684, + "loss": 0.4872, + "step": 275 + }, + { + "epoch": 0.306198862848426, + "grad_norm": 0.1793919801712036, + "learning_rate": 0.0002993832236842105, + "loss": 0.6896, + "step": 276 + }, + { + "epoch": 0.3073082790181667, + "grad_norm": 0.2507120668888092, + "learning_rate": 0.00029925986842105264, + "loss": 0.3792, + "step": 277 + }, + { + "epoch": 0.30841769518790735, + "grad_norm": 0.27677059173583984, + "learning_rate": 0.00029913651315789473, + "loss": 0.7856, + "step": 278 + }, + { + "epoch": 0.30952711135764804, + "grad_norm": 0.18887469172477722, + "learning_rate": 0.00029901315789473683, + "loss": 0.5406, + "step": 279 + }, + { + "epoch": 0.31063652752738874, + "grad_norm": 0.23371614515781403, + "learning_rate": 0.0002988898026315789, + "loss": 0.5748, + "step": 280 + }, + { + "epoch": 0.3117459436971294, + "grad_norm": 0.22486557066440582, + "learning_rate": 0.000298766447368421, + "loss": 0.7443, + "step": 281 + }, + { + "epoch": 0.3128553598668701, + "grad_norm": 0.17992804944515228, + "learning_rate": 0.0002986430921052631, + "loss": 0.5819, + "step": 282 + }, + { + "epoch": 0.3139647760366107, + "grad_norm": 0.20137208700180054, + "learning_rate": 0.00029851973684210525, + "loss": 0.722, + "step": 283 + }, + { + "epoch": 0.3150741922063514, + "grad_norm": 0.25975537300109863, + "learning_rate": 0.00029839638157894735, + "loss": 0.5349, + "step": 284 + }, + { + "epoch": 0.3161836083760921, + "grad_norm": 0.2687530219554901, + "learning_rate": 0.00029827302631578944, + "loss": 0.4361, + "step": 285 + }, + { + "epoch": 0.31729302454583275, + "grad_norm": 0.18794257938861847, + "learning_rate": 0.0002981496710526316, + "loss": 0.5822, + "step": 286 + }, + { + "epoch": 0.31840244071557344, + "grad_norm": 0.12230537086725235, + "learning_rate": 0.0002980263157894737, + "loss": 0.3639, + "step": 287 + }, + { + "epoch": 0.3195118568853141, + "grad_norm": 0.20607517659664154, + "learning_rate": 0.0002979029605263158, + "loss": 0.6476, + "step": 288 + }, + { + "epoch": 0.3206212730550548, + "grad_norm": 0.11566425859928131, + "learning_rate": 0.00029777960526315787, + "loss": 0.5026, + "step": 289 + }, + { + "epoch": 0.3217306892247955, + "grad_norm": 0.22872845828533173, + "learning_rate": 0.00029765624999999996, + "loss": 0.5727, + "step": 290 + }, + { + "epoch": 0.3228401053945361, + "grad_norm": 0.17686223983764648, + "learning_rate": 0.00029753289473684205, + "loss": 0.6867, + "step": 291 + }, + { + "epoch": 0.3239495215642768, + "grad_norm": 0.2232068032026291, + "learning_rate": 0.0002974095394736842, + "loss": 0.4983, + "step": 292 + }, + { + "epoch": 0.32505893773401745, + "grad_norm": 0.26865360140800476, + "learning_rate": 0.0002972861842105263, + "loss": 0.5151, + "step": 293 + }, + { + "epoch": 0.32616835390375815, + "grad_norm": 0.14152151346206665, + "learning_rate": 0.0002971628289473684, + "loss": 0.7316, + "step": 294 + }, + { + "epoch": 0.32727777007349884, + "grad_norm": 0.20797161757946014, + "learning_rate": 0.00029703947368421054, + "loss": 0.4479, + "step": 295 + }, + { + "epoch": 0.3283871862432395, + "grad_norm": 0.1608234941959381, + "learning_rate": 0.00029691611842105263, + "loss": 0.5482, + "step": 296 + }, + { + "epoch": 0.3294966024129802, + "grad_norm": 0.2219133973121643, + "learning_rate": 0.0002967927631578947, + "loss": 0.4474, + "step": 297 + }, + { + "epoch": 0.3306060185827208, + "grad_norm": 0.3990642726421356, + "learning_rate": 0.0002966694078947368, + "loss": 0.6421, + "step": 298 + }, + { + "epoch": 0.3317154347524615, + "grad_norm": 0.20786860585212708, + "learning_rate": 0.0002965460526315789, + "loss": 0.5569, + "step": 299 + }, + { + "epoch": 0.3328248509222022, + "grad_norm": 0.1967337280511856, + "learning_rate": 0.000296422697368421, + "loss": 0.6013, + "step": 300 + }, + { + "epoch": 0.33393426709194285, + "grad_norm": 0.19602453708648682, + "learning_rate": 0.00029629934210526315, + "loss": 0.5516, + "step": 301 + }, + { + "epoch": 0.33504368326168354, + "grad_norm": 0.23872393369674683, + "learning_rate": 0.00029617598684210524, + "loss": 0.5124, + "step": 302 + }, + { + "epoch": 0.33615309943142424, + "grad_norm": 0.20506146550178528, + "learning_rate": 0.00029605263157894733, + "loss": 0.6206, + "step": 303 + }, + { + "epoch": 0.3372625156011649, + "grad_norm": 0.20456762611865997, + "learning_rate": 0.00029592927631578943, + "loss": 0.4852, + "step": 304 + }, + { + "epoch": 0.3383719317709056, + "grad_norm": 0.23360048234462738, + "learning_rate": 0.0002958059210526316, + "loss": 0.4821, + "step": 305 + }, + { + "epoch": 0.3394813479406462, + "grad_norm": 0.16443900763988495, + "learning_rate": 0.00029568256578947367, + "loss": 0.4089, + "step": 306 + }, + { + "epoch": 0.3405907641103869, + "grad_norm": 0.15696674585342407, + "learning_rate": 0.00029555921052631576, + "loss": 0.5479, + "step": 307 + }, + { + "epoch": 0.3417001802801276, + "grad_norm": 0.20780624449253082, + "learning_rate": 0.00029543585526315785, + "loss": 0.4644, + "step": 308 + }, + { + "epoch": 0.34280959644986825, + "grad_norm": 0.185526043176651, + "learning_rate": 0.00029531249999999995, + "loss": 0.7259, + "step": 309 + }, + { + "epoch": 0.34391901261960894, + "grad_norm": 0.1540479063987732, + "learning_rate": 0.0002951891447368421, + "loss": 0.6167, + "step": 310 + }, + { + "epoch": 0.3450284287893496, + "grad_norm": 0.26093733310699463, + "learning_rate": 0.0002950657894736842, + "loss": 0.6427, + "step": 311 + }, + { + "epoch": 0.3461378449590903, + "grad_norm": 0.21596834063529968, + "learning_rate": 0.0002949424342105263, + "loss": 0.5916, + "step": 312 + }, + { + "epoch": 0.347247261128831, + "grad_norm": 0.20977520942687988, + "learning_rate": 0.0002948190789473684, + "loss": 0.6572, + "step": 313 + }, + { + "epoch": 0.3483566772985716, + "grad_norm": 0.1886155605316162, + "learning_rate": 0.0002946957236842105, + "loss": 0.5399, + "step": 314 + }, + { + "epoch": 0.3494660934683123, + "grad_norm": 0.18215329945087433, + "learning_rate": 0.0002945723684210526, + "loss": 0.5724, + "step": 315 + }, + { + "epoch": 0.35057550963805295, + "grad_norm": 0.2055482119321823, + "learning_rate": 0.0002944490131578947, + "loss": 0.5834, + "step": 316 + }, + { + "epoch": 0.35168492580779365, + "grad_norm": 0.15489786863327026, + "learning_rate": 0.0002943256578947368, + "loss": 0.5003, + "step": 317 + }, + { + "epoch": 0.35279434197753434, + "grad_norm": 0.17881543934345245, + "learning_rate": 0.00029420230263157895, + "loss": 0.5105, + "step": 318 + }, + { + "epoch": 0.353903758147275, + "grad_norm": 0.1768861562013626, + "learning_rate": 0.00029407894736842104, + "loss": 0.5363, + "step": 319 + }, + { + "epoch": 0.3550131743170157, + "grad_norm": 0.2283925861120224, + "learning_rate": 0.00029395559210526314, + "loss": 0.5452, + "step": 320 + }, + { + "epoch": 0.3561225904867563, + "grad_norm": 0.19439998269081116, + "learning_rate": 0.00029383223684210523, + "loss": 0.5603, + "step": 321 + }, + { + "epoch": 0.357232006656497, + "grad_norm": 0.2097710222005844, + "learning_rate": 0.0002937088815789473, + "loss": 0.625, + "step": 322 + }, + { + "epoch": 0.3583414228262377, + "grad_norm": 0.186342254281044, + "learning_rate": 0.0002935855263157894, + "loss": 0.3553, + "step": 323 + }, + { + "epoch": 0.35945083899597835, + "grad_norm": 0.275612473487854, + "learning_rate": 0.00029346217105263156, + "loss": 0.6274, + "step": 324 + }, + { + "epoch": 0.36056025516571905, + "grad_norm": 0.19332240521907806, + "learning_rate": 0.00029333881578947366, + "loss": 0.5827, + "step": 325 + }, + { + "epoch": 0.36166967133545974, + "grad_norm": 0.18259958922863007, + "learning_rate": 0.00029321546052631575, + "loss": 0.4502, + "step": 326 + }, + { + "epoch": 0.3627790875052004, + "grad_norm": 0.25983044505119324, + "learning_rate": 0.0002930921052631579, + "loss": 0.8896, + "step": 327 + }, + { + "epoch": 0.3638885036749411, + "grad_norm": 0.1671958863735199, + "learning_rate": 0.00029296875, + "loss": 0.5232, + "step": 328 + }, + { + "epoch": 0.3649979198446817, + "grad_norm": 0.18751101195812225, + "learning_rate": 0.0002928453947368421, + "loss": 0.4556, + "step": 329 + }, + { + "epoch": 0.3661073360144224, + "grad_norm": 0.19776886701583862, + "learning_rate": 0.0002927220394736842, + "loss": 0.8467, + "step": 330 + }, + { + "epoch": 0.3672167521841631, + "grad_norm": 0.2738226354122162, + "learning_rate": 0.00029259868421052627, + "loss": 0.7323, + "step": 331 + }, + { + "epoch": 0.36832616835390375, + "grad_norm": 0.16836367547512054, + "learning_rate": 0.00029247532894736836, + "loss": 0.43, + "step": 332 + }, + { + "epoch": 0.36943558452364444, + "grad_norm": 0.30866721272468567, + "learning_rate": 0.0002923519736842105, + "loss": 0.3759, + "step": 333 + }, + { + "epoch": 0.3705450006933851, + "grad_norm": 0.20256434381008148, + "learning_rate": 0.0002922286184210526, + "loss": 0.6727, + "step": 334 + }, + { + "epoch": 0.3716544168631258, + "grad_norm": 0.18781127035617828, + "learning_rate": 0.0002921052631578947, + "loss": 0.5784, + "step": 335 + }, + { + "epoch": 0.3727638330328665, + "grad_norm": 0.3620914816856384, + "learning_rate": 0.00029198190789473684, + "loss": 0.8316, + "step": 336 + }, + { + "epoch": 0.3738732492026071, + "grad_norm": 0.1937960535287857, + "learning_rate": 0.00029185855263157894, + "loss": 0.5956, + "step": 337 + }, + { + "epoch": 0.3749826653723478, + "grad_norm": 0.21955540776252747, + "learning_rate": 0.00029173519736842103, + "loss": 0.6847, + "step": 338 + }, + { + "epoch": 0.37609208154208845, + "grad_norm": 0.22091850638389587, + "learning_rate": 0.0002916118421052631, + "loss": 0.7571, + "step": 339 + }, + { + "epoch": 0.37720149771182915, + "grad_norm": 0.17953120172023773, + "learning_rate": 0.0002914884868421052, + "loss": 0.596, + "step": 340 + }, + { + "epoch": 0.37831091388156984, + "grad_norm": 0.2171243131160736, + "learning_rate": 0.0002913651315789473, + "loss": 0.5556, + "step": 341 + }, + { + "epoch": 0.3794203300513105, + "grad_norm": 0.21151772141456604, + "learning_rate": 0.00029124177631578946, + "loss": 0.8455, + "step": 342 + }, + { + "epoch": 0.3805297462210512, + "grad_norm": 0.21299928426742554, + "learning_rate": 0.00029111842105263155, + "loss": 0.4473, + "step": 343 + }, + { + "epoch": 0.3816391623907918, + "grad_norm": 0.20761217176914215, + "learning_rate": 0.00029099506578947364, + "loss": 0.6742, + "step": 344 + }, + { + "epoch": 0.3827485785605325, + "grad_norm": 0.21071919798851013, + "learning_rate": 0.0002908717105263158, + "loss": 0.5404, + "step": 345 + }, + { + "epoch": 0.3838579947302732, + "grad_norm": 0.3248625099658966, + "learning_rate": 0.0002907483552631579, + "loss": 0.7006, + "step": 346 + }, + { + "epoch": 0.38496741090001385, + "grad_norm": 0.3169274628162384, + "learning_rate": 0.000290625, + "loss": 0.5953, + "step": 347 + }, + { + "epoch": 0.38607682706975455, + "grad_norm": 0.16656579077243805, + "learning_rate": 0.00029050164473684207, + "loss": 0.5392, + "step": 348 + }, + { + "epoch": 0.38718624323949524, + "grad_norm": 0.1759122908115387, + "learning_rate": 0.00029037828947368416, + "loss": 0.6298, + "step": 349 + }, + { + "epoch": 0.3882956594092359, + "grad_norm": 0.17183220386505127, + "learning_rate": 0.00029025493421052626, + "loss": 0.49, + "step": 350 + }, + { + "epoch": 0.3894050755789766, + "grad_norm": 0.2180389016866684, + "learning_rate": 0.0002901315789473684, + "loss": 0.4447, + "step": 351 + }, + { + "epoch": 0.3905144917487172, + "grad_norm": 0.3889177143573761, + "learning_rate": 0.0002900082236842105, + "loss": 0.48, + "step": 352 + }, + { + "epoch": 0.3916239079184579, + "grad_norm": 0.22550411522388458, + "learning_rate": 0.00028988486842105264, + "loss": 0.5976, + "step": 353 + }, + { + "epoch": 0.3927333240881986, + "grad_norm": 0.20059050619602203, + "learning_rate": 0.00028976151315789474, + "loss": 0.578, + "step": 354 + }, + { + "epoch": 0.39384274025793925, + "grad_norm": 0.2586449086666107, + "learning_rate": 0.00028963815789473683, + "loss": 0.6453, + "step": 355 + }, + { + "epoch": 0.39495215642767995, + "grad_norm": 0.24946491420269012, + "learning_rate": 0.0002895148026315789, + "loss": 0.649, + "step": 356 + }, + { + "epoch": 0.3960615725974206, + "grad_norm": 0.1763986498117447, + "learning_rate": 0.000289391447368421, + "loss": 0.6183, + "step": 357 + }, + { + "epoch": 0.3971709887671613, + "grad_norm": 0.1732664704322815, + "learning_rate": 0.0002892680921052631, + "loss": 0.8931, + "step": 358 + }, + { + "epoch": 0.398280404936902, + "grad_norm": 0.22204923629760742, + "learning_rate": 0.0002891447368421052, + "loss": 0.5979, + "step": 359 + }, + { + "epoch": 0.3993898211066426, + "grad_norm": 0.1942061334848404, + "learning_rate": 0.00028902138157894735, + "loss": 0.5693, + "step": 360 + }, + { + "epoch": 0.4004992372763833, + "grad_norm": 0.2240975797176361, + "learning_rate": 0.00028889802631578944, + "loss": 0.6672, + "step": 361 + }, + { + "epoch": 0.40160865344612395, + "grad_norm": 0.19991931319236755, + "learning_rate": 0.0002887746710526316, + "loss": 0.7095, + "step": 362 + }, + { + "epoch": 0.40271806961586465, + "grad_norm": 0.1844676285982132, + "learning_rate": 0.0002886513157894737, + "loss": 0.5337, + "step": 363 + }, + { + "epoch": 0.40382748578560534, + "grad_norm": 0.2115306556224823, + "learning_rate": 0.0002885279605263158, + "loss": 0.6032, + "step": 364 + }, + { + "epoch": 0.404936901955346, + "grad_norm": 0.2993430495262146, + "learning_rate": 0.00028840460526315787, + "loss": 0.8077, + "step": 365 + }, + { + "epoch": 0.4060463181250867, + "grad_norm": 0.41001948714256287, + "learning_rate": 0.00028828124999999996, + "loss": 0.7461, + "step": 366 + }, + { + "epoch": 0.4071557342948273, + "grad_norm": 0.1562204509973526, + "learning_rate": 0.00028815789473684206, + "loss": 0.4517, + "step": 367 + }, + { + "epoch": 0.408265150464568, + "grad_norm": 0.27741947770118713, + "learning_rate": 0.0002880345394736842, + "loss": 0.6076, + "step": 368 + }, + { + "epoch": 0.4093745666343087, + "grad_norm": 0.205497607588768, + "learning_rate": 0.0002879111842105263, + "loss": 0.6499, + "step": 369 + }, + { + "epoch": 0.41048398280404935, + "grad_norm": 0.412622332572937, + "learning_rate": 0.0002877878289473684, + "loss": 0.6487, + "step": 370 + }, + { + "epoch": 0.41159339897379005, + "grad_norm": 0.19786472618579865, + "learning_rate": 0.00028766447368421054, + "loss": 0.4202, + "step": 371 + }, + { + "epoch": 0.41270281514353074, + "grad_norm": 0.1875920295715332, + "learning_rate": 0.00028754111842105263, + "loss": 0.5377, + "step": 372 + }, + { + "epoch": 0.4138122313132714, + "grad_norm": 0.22071506083011627, + "learning_rate": 0.0002874177631578947, + "loss": 0.4671, + "step": 373 + }, + { + "epoch": 0.4149216474830121, + "grad_norm": 0.22277134656906128, + "learning_rate": 0.0002872944078947368, + "loss": 0.6436, + "step": 374 + }, + { + "epoch": 0.4160310636527527, + "grad_norm": 0.13663825392723083, + "learning_rate": 0.0002871710526315789, + "loss": 0.3595, + "step": 375 + }, + { + "epoch": 0.4171404798224934, + "grad_norm": 0.25280505418777466, + "learning_rate": 0.000287047697368421, + "loss": 0.7154, + "step": 376 + }, + { + "epoch": 0.4182498959922341, + "grad_norm": 0.2542460262775421, + "learning_rate": 0.00028692434210526315, + "loss": 0.5229, + "step": 377 + }, + { + "epoch": 0.41935931216197475, + "grad_norm": 0.20687228441238403, + "learning_rate": 0.00028680098684210524, + "loss": 0.4229, + "step": 378 + }, + { + "epoch": 0.42046872833171545, + "grad_norm": 0.1827574074268341, + "learning_rate": 0.00028667763157894734, + "loss": 0.504, + "step": 379 + }, + { + "epoch": 0.4215781445014561, + "grad_norm": 0.22924618422985077, + "learning_rate": 0.00028655427631578943, + "loss": 0.4645, + "step": 380 + }, + { + "epoch": 0.4226875606711968, + "grad_norm": 0.1500721126794815, + "learning_rate": 0.0002864309210526316, + "loss": 0.2866, + "step": 381 + }, + { + "epoch": 0.4237969768409375, + "grad_norm": 0.19137370586395264, + "learning_rate": 0.00028630756578947367, + "loss": 0.4739, + "step": 382 + }, + { + "epoch": 0.4249063930106781, + "grad_norm": 0.1940913051366806, + "learning_rate": 0.00028618421052631576, + "loss": 0.6433, + "step": 383 + }, + { + "epoch": 0.4260158091804188, + "grad_norm": 0.17999312281608582, + "learning_rate": 0.00028606085526315786, + "loss": 0.6118, + "step": 384 + }, + { + "epoch": 0.42712522535015945, + "grad_norm": 0.21522557735443115, + "learning_rate": 0.00028593749999999995, + "loss": 0.5608, + "step": 385 + }, + { + "epoch": 0.42823464151990015, + "grad_norm": 0.23753724992275238, + "learning_rate": 0.0002858141447368421, + "loss": 0.4775, + "step": 386 + }, + { + "epoch": 0.42934405768964085, + "grad_norm": 0.28104132413864136, + "learning_rate": 0.0002856907894736842, + "loss": 0.7179, + "step": 387 + }, + { + "epoch": 0.4304534738593815, + "grad_norm": 0.16530390083789825, + "learning_rate": 0.0002855674342105263, + "loss": 0.5765, + "step": 388 + }, + { + "epoch": 0.4315628900291222, + "grad_norm": 0.20358699560165405, + "learning_rate": 0.0002854440789473684, + "loss": 0.6325, + "step": 389 + }, + { + "epoch": 0.4326723061988628, + "grad_norm": 0.17629845440387726, + "learning_rate": 0.0002853207236842105, + "loss": 0.5528, + "step": 390 + }, + { + "epoch": 0.4337817223686035, + "grad_norm": 0.25051596760749817, + "learning_rate": 0.0002851973684210526, + "loss": 0.6712, + "step": 391 + }, + { + "epoch": 0.4348911385383442, + "grad_norm": 0.19358691573143005, + "learning_rate": 0.0002850740131578947, + "loss": 0.5572, + "step": 392 + }, + { + "epoch": 0.43600055470808485, + "grad_norm": 0.13769972324371338, + "learning_rate": 0.0002849506578947368, + "loss": 0.5055, + "step": 393 + }, + { + "epoch": 0.43710997087782555, + "grad_norm": 0.14807964861392975, + "learning_rate": 0.0002848273026315789, + "loss": 0.5963, + "step": 394 + }, + { + "epoch": 0.43821938704756624, + "grad_norm": 0.16840098798274994, + "learning_rate": 0.00028470394736842104, + "loss": 0.4203, + "step": 395 + }, + { + "epoch": 0.4393288032173069, + "grad_norm": 0.16224174201488495, + "learning_rate": 0.00028458059210526314, + "loss": 0.7647, + "step": 396 + }, + { + "epoch": 0.4404382193870476, + "grad_norm": 0.2029973566532135, + "learning_rate": 0.00028445723684210523, + "loss": 0.6427, + "step": 397 + }, + { + "epoch": 0.4415476355567882, + "grad_norm": 0.23025457561016083, + "learning_rate": 0.0002843338815789473, + "loss": 0.5838, + "step": 398 + }, + { + "epoch": 0.4426570517265289, + "grad_norm": 0.2005675584077835, + "learning_rate": 0.0002842105263157894, + "loss": 0.6441, + "step": 399 + }, + { + "epoch": 0.4437664678962696, + "grad_norm": 0.2208050787448883, + "learning_rate": 0.00028408717105263156, + "loss": 0.656, + "step": 400 + }, + { + "epoch": 0.44487588406601025, + "grad_norm": 0.28568586707115173, + "learning_rate": 0.00028396381578947366, + "loss": 0.6623, + "step": 401 + }, + { + "epoch": 0.44598530023575095, + "grad_norm": 0.22206373512744904, + "learning_rate": 0.00028384046052631575, + "loss": 0.5584, + "step": 402 + }, + { + "epoch": 0.4470947164054916, + "grad_norm": 0.20780105888843536, + "learning_rate": 0.0002837171052631579, + "loss": 0.5801, + "step": 403 + }, + { + "epoch": 0.4482041325752323, + "grad_norm": 0.20285826921463013, + "learning_rate": 0.00028359375, + "loss": 0.6751, + "step": 404 + }, + { + "epoch": 0.449313548744973, + "grad_norm": 0.15252311527729034, + "learning_rate": 0.0002834703947368421, + "loss": 1.1206, + "step": 405 + }, + { + "epoch": 0.4504229649147136, + "grad_norm": 0.23378504812717438, + "learning_rate": 0.0002833470394736842, + "loss": 0.6577, + "step": 406 + }, + { + "epoch": 0.4515323810844543, + "grad_norm": 0.1787406951189041, + "learning_rate": 0.00028322368421052627, + "loss": 0.7497, + "step": 407 + }, + { + "epoch": 0.45264179725419496, + "grad_norm": 0.48000597953796387, + "learning_rate": 0.00028310032894736836, + "loss": 0.752, + "step": 408 + }, + { + "epoch": 0.45375121342393565, + "grad_norm": 0.2794741094112396, + "learning_rate": 0.0002829769736842105, + "loss": 0.5731, + "step": 409 + }, + { + "epoch": 0.45486062959367635, + "grad_norm": 0.2653048634529114, + "learning_rate": 0.0002828536184210526, + "loss": 0.4867, + "step": 410 + }, + { + "epoch": 0.455970045763417, + "grad_norm": 0.19287265837192535, + "learning_rate": 0.0002827302631578947, + "loss": 0.643, + "step": 411 + }, + { + "epoch": 0.4570794619331577, + "grad_norm": 0.2332431823015213, + "learning_rate": 0.00028260690789473685, + "loss": 0.4386, + "step": 412 + }, + { + "epoch": 0.4581888781028983, + "grad_norm": 0.21826831996440887, + "learning_rate": 0.00028248355263157894, + "loss": 0.548, + "step": 413 + }, + { + "epoch": 0.459298294272639, + "grad_norm": 0.23967108130455017, + "learning_rate": 0.00028236019736842103, + "loss": 0.5739, + "step": 414 + }, + { + "epoch": 0.4604077104423797, + "grad_norm": 0.18406794965267181, + "learning_rate": 0.0002822368421052631, + "loss": 0.4547, + "step": 415 + }, + { + "epoch": 0.46151712661212035, + "grad_norm": 0.18926838040351868, + "learning_rate": 0.0002821134868421052, + "loss": 0.5208, + "step": 416 + }, + { + "epoch": 0.46262654278186105, + "grad_norm": 0.16055501997470856, + "learning_rate": 0.0002819901315789473, + "loss": 0.4273, + "step": 417 + }, + { + "epoch": 0.46373595895160175, + "grad_norm": 0.18228095769882202, + "learning_rate": 0.00028186677631578946, + "loss": 0.6194, + "step": 418 + }, + { + "epoch": 0.4648453751213424, + "grad_norm": 0.1525285542011261, + "learning_rate": 0.00028174342105263155, + "loss": 0.5515, + "step": 419 + }, + { + "epoch": 0.4659547912910831, + "grad_norm": 0.24812228977680206, + "learning_rate": 0.00028162006578947365, + "loss": 0.4666, + "step": 420 + }, + { + "epoch": 0.4670642074608237, + "grad_norm": 0.3236676752567291, + "learning_rate": 0.0002814967105263158, + "loss": 0.6294, + "step": 421 + }, + { + "epoch": 0.4681736236305644, + "grad_norm": 0.15236404538154602, + "learning_rate": 0.0002813733552631579, + "loss": 0.4148, + "step": 422 + }, + { + "epoch": 0.4692830398003051, + "grad_norm": 0.20661397278308868, + "learning_rate": 0.00028125, + "loss": 0.4696, + "step": 423 + }, + { + "epoch": 0.47039245597004575, + "grad_norm": 0.3039199113845825, + "learning_rate": 0.00028112664473684207, + "loss": 0.7764, + "step": 424 + }, + { + "epoch": 0.47150187213978645, + "grad_norm": 0.2243487536907196, + "learning_rate": 0.00028100328947368417, + "loss": 0.575, + "step": 425 + }, + { + "epoch": 0.4726112883095271, + "grad_norm": 0.15252256393432617, + "learning_rate": 0.00028087993421052626, + "loss": 0.6139, + "step": 426 + }, + { + "epoch": 0.4737207044792678, + "grad_norm": 0.2288883924484253, + "learning_rate": 0.0002807565789473684, + "loss": 0.5199, + "step": 427 + }, + { + "epoch": 0.4748301206490085, + "grad_norm": 0.1678112894296646, + "learning_rate": 0.0002806332236842105, + "loss": 0.699, + "step": 428 + }, + { + "epoch": 0.4759395368187491, + "grad_norm": 0.27088475227355957, + "learning_rate": 0.00028050986842105265, + "loss": 0.5913, + "step": 429 + }, + { + "epoch": 0.4770489529884898, + "grad_norm": 0.19018007814884186, + "learning_rate": 0.00028038651315789474, + "loss": 0.5341, + "step": 430 + }, + { + "epoch": 0.47815836915823046, + "grad_norm": 0.19086478650569916, + "learning_rate": 0.00028026315789473683, + "loss": 0.565, + "step": 431 + }, + { + "epoch": 0.47926778532797115, + "grad_norm": 0.26616349816322327, + "learning_rate": 0.0002801398026315789, + "loss": 0.7576, + "step": 432 + }, + { + "epoch": 0.48037720149771185, + "grad_norm": 0.16013029217720032, + "learning_rate": 0.000280016447368421, + "loss": 0.5242, + "step": 433 + }, + { + "epoch": 0.4814866176674525, + "grad_norm": 0.16926300525665283, + "learning_rate": 0.0002798930921052631, + "loss": 0.4289, + "step": 434 + }, + { + "epoch": 0.4825960338371932, + "grad_norm": 0.2056371569633484, + "learning_rate": 0.0002797697368421052, + "loss": 0.5971, + "step": 435 + }, + { + "epoch": 0.4837054500069338, + "grad_norm": 0.1635441929101944, + "learning_rate": 0.00027964638157894735, + "loss": 0.4277, + "step": 436 + }, + { + "epoch": 0.4848148661766745, + "grad_norm": 0.24154643714427948, + "learning_rate": 0.00027952302631578945, + "loss": 0.6508, + "step": 437 + }, + { + "epoch": 0.4859242823464152, + "grad_norm": 0.2069096565246582, + "learning_rate": 0.0002793996710526316, + "loss": 0.5105, + "step": 438 + }, + { + "epoch": 0.48703369851615586, + "grad_norm": 0.23238608241081238, + "learning_rate": 0.0002792763157894737, + "loss": 0.7069, + "step": 439 + }, + { + "epoch": 0.48814311468589655, + "grad_norm": 0.18234537541866302, + "learning_rate": 0.0002791529605263158, + "loss": 0.5141, + "step": 440 + }, + { + "epoch": 0.48925253085563725, + "grad_norm": 0.1497894525527954, + "learning_rate": 0.00027902960526315787, + "loss": 0.5206, + "step": 441 + }, + { + "epoch": 0.4903619470253779, + "grad_norm": 0.2433656007051468, + "learning_rate": 0.00027890624999999997, + "loss": 0.5697, + "step": 442 + }, + { + "epoch": 0.4914713631951186, + "grad_norm": 0.1533818542957306, + "learning_rate": 0.00027878289473684206, + "loss": 0.359, + "step": 443 + }, + { + "epoch": 0.4925807793648592, + "grad_norm": 0.2589110732078552, + "learning_rate": 0.00027865953947368415, + "loss": 0.6381, + "step": 444 + }, + { + "epoch": 0.4936901955345999, + "grad_norm": 0.2857501208782196, + "learning_rate": 0.0002785361842105263, + "loss": 0.7064, + "step": 445 + }, + { + "epoch": 0.4947996117043406, + "grad_norm": 0.21398474276065826, + "learning_rate": 0.0002784128289473684, + "loss": 0.5152, + "step": 446 + }, + { + "epoch": 0.49590902787408125, + "grad_norm": 0.23514775931835175, + "learning_rate": 0.00027828947368421054, + "loss": 0.8143, + "step": 447 + }, + { + "epoch": 0.49701844404382195, + "grad_norm": 0.18278779089450836, + "learning_rate": 0.00027816611842105263, + "loss": 0.4275, + "step": 448 + }, + { + "epoch": 0.4981278602135626, + "grad_norm": 0.16910268366336823, + "learning_rate": 0.0002780427631578947, + "loss": 0.5477, + "step": 449 + }, + { + "epoch": 0.4992372763833033, + "grad_norm": 0.17349810898303986, + "learning_rate": 0.0002779194078947368, + "loss": 0.5224, + "step": 450 + }, + { + "epoch": 0.500346692553044, + "grad_norm": 0.2979370653629303, + "learning_rate": 0.0002777960526315789, + "loss": 0.6723, + "step": 451 + }, + { + "epoch": 0.5014561087227847, + "grad_norm": 0.2329479455947876, + "learning_rate": 0.000277672697368421, + "loss": 0.5004, + "step": 452 + }, + { + "epoch": 0.5025655248925253, + "grad_norm": 0.18267230689525604, + "learning_rate": 0.00027754934210526315, + "loss": 0.5729, + "step": 453 + }, + { + "epoch": 0.503674941062266, + "grad_norm": 0.21524755656719208, + "learning_rate": 0.00027742598684210525, + "loss": 0.4295, + "step": 454 + }, + { + "epoch": 0.5047843572320067, + "grad_norm": 0.18233224749565125, + "learning_rate": 0.00027730263157894734, + "loss": 0.4668, + "step": 455 + }, + { + "epoch": 0.5058937734017473, + "grad_norm": 0.19789119064807892, + "learning_rate": 0.00027717927631578943, + "loss": 0.6149, + "step": 456 + }, + { + "epoch": 0.507003189571488, + "grad_norm": 0.16243144869804382, + "learning_rate": 0.0002770559210526316, + "loss": 0.415, + "step": 457 + }, + { + "epoch": 0.5081126057412286, + "grad_norm": 0.22004704177379608, + "learning_rate": 0.0002769325657894737, + "loss": 0.6058, + "step": 458 + }, + { + "epoch": 0.5092220219109693, + "grad_norm": 0.1755845546722412, + "learning_rate": 0.00027680921052631577, + "loss": 0.659, + "step": 459 + }, + { + "epoch": 0.51033143808071, + "grad_norm": 0.16181863844394684, + "learning_rate": 0.00027668585526315786, + "loss": 0.5118, + "step": 460 + }, + { + "epoch": 0.5114408542504507, + "grad_norm": 0.30584779381752014, + "learning_rate": 0.00027656249999999995, + "loss": 0.5185, + "step": 461 + }, + { + "epoch": 0.5125502704201914, + "grad_norm": 0.2442709058523178, + "learning_rate": 0.0002764391447368421, + "loss": 0.6267, + "step": 462 + }, + { + "epoch": 0.513659686589932, + "grad_norm": 0.17913980782032013, + "learning_rate": 0.0002763157894736842, + "loss": 0.4123, + "step": 463 + }, + { + "epoch": 0.5147691027596727, + "grad_norm": 0.16953568160533905, + "learning_rate": 0.0002761924342105263, + "loss": 0.4336, + "step": 464 + }, + { + "epoch": 0.5158785189294134, + "grad_norm": 0.2636931538581848, + "learning_rate": 0.0002760690789473684, + "loss": 0.8255, + "step": 465 + }, + { + "epoch": 0.5169879350991541, + "grad_norm": 0.1953415870666504, + "learning_rate": 0.00027594572368421053, + "loss": 0.3879, + "step": 466 + }, + { + "epoch": 0.5180973512688948, + "grad_norm": 0.23631513118743896, + "learning_rate": 0.0002758223684210526, + "loss": 0.4591, + "step": 467 + }, + { + "epoch": 0.5192067674386354, + "grad_norm": 0.25506916642189026, + "learning_rate": 0.0002756990131578947, + "loss": 0.6347, + "step": 468 + }, + { + "epoch": 0.5203161836083761, + "grad_norm": 0.1907813549041748, + "learning_rate": 0.0002755756578947368, + "loss": 0.6049, + "step": 469 + }, + { + "epoch": 0.5214255997781168, + "grad_norm": 0.26337459683418274, + "learning_rate": 0.0002754523026315789, + "loss": 0.9162, + "step": 470 + }, + { + "epoch": 0.5225350159478575, + "grad_norm": 0.2560301721096039, + "learning_rate": 0.00027532894736842105, + "loss": 0.5243, + "step": 471 + }, + { + "epoch": 0.5236444321175981, + "grad_norm": 0.21078939735889435, + "learning_rate": 0.00027520559210526314, + "loss": 0.528, + "step": 472 + }, + { + "epoch": 0.5247538482873388, + "grad_norm": 0.19924308359622955, + "learning_rate": 0.00027508223684210523, + "loss": 0.484, + "step": 473 + }, + { + "epoch": 0.5258632644570794, + "grad_norm": 0.1312379091978073, + "learning_rate": 0.0002749588815789473, + "loss": 0.5321, + "step": 474 + }, + { + "epoch": 0.5269726806268201, + "grad_norm": 0.32874926924705505, + "learning_rate": 0.0002748355263157894, + "loss": 0.5229, + "step": 475 + }, + { + "epoch": 0.5280820967965608, + "grad_norm": 0.2065068781375885, + "learning_rate": 0.00027471217105263157, + "loss": 0.5401, + "step": 476 + }, + { + "epoch": 0.5291915129663015, + "grad_norm": 0.18827693164348602, + "learning_rate": 0.00027458881578947366, + "loss": 0.3993, + "step": 477 + }, + { + "epoch": 0.5303009291360422, + "grad_norm": 0.20950929820537567, + "learning_rate": 0.00027446546052631575, + "loss": 0.6372, + "step": 478 + }, + { + "epoch": 0.5314103453057828, + "grad_norm": 0.20649929344654083, + "learning_rate": 0.0002743421052631579, + "loss": 0.647, + "step": 479 + }, + { + "epoch": 0.5325197614755235, + "grad_norm": 0.14667537808418274, + "learning_rate": 0.00027421875, + "loss": 0.5186, + "step": 480 + }, + { + "epoch": 0.5336291776452642, + "grad_norm": 0.19212156534194946, + "learning_rate": 0.0002740953947368421, + "loss": 0.8352, + "step": 481 + }, + { + "epoch": 0.5347385938150049, + "grad_norm": 0.2529224753379822, + "learning_rate": 0.0002739720394736842, + "loss": 0.5348, + "step": 482 + }, + { + "epoch": 0.5358480099847456, + "grad_norm": 0.2153153419494629, + "learning_rate": 0.0002738486842105263, + "loss": 0.6096, + "step": 483 + }, + { + "epoch": 0.5369574261544862, + "grad_norm": 0.2908189296722412, + "learning_rate": 0.00027372532894736837, + "loss": 0.6725, + "step": 484 + }, + { + "epoch": 0.5380668423242269, + "grad_norm": 0.2697788178920746, + "learning_rate": 0.0002736019736842105, + "loss": 0.4649, + "step": 485 + }, + { + "epoch": 0.5391762584939676, + "grad_norm": 0.20288242399692535, + "learning_rate": 0.0002734786184210526, + "loss": 0.5319, + "step": 486 + }, + { + "epoch": 0.5402856746637082, + "grad_norm": 0.25883370637893677, + "learning_rate": 0.0002733552631578947, + "loss": 0.3363, + "step": 487 + }, + { + "epoch": 0.5413950908334489, + "grad_norm": 0.15326879918575287, + "learning_rate": 0.00027323190789473685, + "loss": 0.4186, + "step": 488 + }, + { + "epoch": 0.5425045070031895, + "grad_norm": 0.20244112610816956, + "learning_rate": 0.00027310855263157894, + "loss": 0.5039, + "step": 489 + }, + { + "epoch": 0.5436139231729302, + "grad_norm": 0.21093213558197021, + "learning_rate": 0.00027298519736842103, + "loss": 0.7453, + "step": 490 + }, + { + "epoch": 0.5447233393426709, + "grad_norm": 0.22425080835819244, + "learning_rate": 0.00027286184210526313, + "loss": 0.5743, + "step": 491 + }, + { + "epoch": 0.5458327555124116, + "grad_norm": 0.2680664360523224, + "learning_rate": 0.0002727384868421052, + "loss": 0.5446, + "step": 492 + }, + { + "epoch": 0.5469421716821523, + "grad_norm": 0.24040096998214722, + "learning_rate": 0.0002726151315789473, + "loss": 0.4714, + "step": 493 + }, + { + "epoch": 0.5480515878518929, + "grad_norm": 0.30965113639831543, + "learning_rate": 0.0002724917763157894, + "loss": 0.8154, + "step": 494 + }, + { + "epoch": 0.5491610040216336, + "grad_norm": 0.16201867163181305, + "learning_rate": 0.00027236842105263155, + "loss": 0.4898, + "step": 495 + }, + { + "epoch": 0.5502704201913743, + "grad_norm": 0.3444017767906189, + "learning_rate": 0.00027224506578947365, + "loss": 0.5012, + "step": 496 + }, + { + "epoch": 0.551379836361115, + "grad_norm": 0.5116562247276306, + "learning_rate": 0.0002721217105263158, + "loss": 0.4425, + "step": 497 + }, + { + "epoch": 0.5524892525308557, + "grad_norm": 0.17541073262691498, + "learning_rate": 0.0002719983552631579, + "loss": 0.6107, + "step": 498 + }, + { + "epoch": 0.5535986687005963, + "grad_norm": 0.4572921097278595, + "learning_rate": 0.000271875, + "loss": 0.5645, + "step": 499 + }, + { + "epoch": 0.554708084870337, + "grad_norm": 0.35991188883781433, + "learning_rate": 0.0002717516447368421, + "loss": 0.4648, + "step": 500 + }, + { + "epoch": 0.5558175010400777, + "grad_norm": 0.2327331304550171, + "learning_rate": 0.00027162828947368417, + "loss": 0.5978, + "step": 501 + }, + { + "epoch": 0.5569269172098184, + "grad_norm": 0.25607866048812866, + "learning_rate": 0.00027150493421052626, + "loss": 0.7341, + "step": 502 + }, + { + "epoch": 0.558036333379559, + "grad_norm": 0.26063939929008484, + "learning_rate": 0.0002713815789473684, + "loss": 0.6693, + "step": 503 + }, + { + "epoch": 0.5591457495492996, + "grad_norm": 0.23963363468647003, + "learning_rate": 0.0002712582236842105, + "loss": 0.4704, + "step": 504 + }, + { + "epoch": 0.5602551657190403, + "grad_norm": 0.21853481233119965, + "learning_rate": 0.0002711348684210526, + "loss": 0.827, + "step": 505 + }, + { + "epoch": 0.561364581888781, + "grad_norm": 0.2731577754020691, + "learning_rate": 0.00027101151315789474, + "loss": 0.5781, + "step": 506 + }, + { + "epoch": 0.5624739980585217, + "grad_norm": 0.1824404001235962, + "learning_rate": 0.00027088815789473684, + "loss": 0.5114, + "step": 507 + }, + { + "epoch": 0.5635834142282624, + "grad_norm": 0.26666054129600525, + "learning_rate": 0.00027076480263157893, + "loss": 0.5057, + "step": 508 + }, + { + "epoch": 0.564692830398003, + "grad_norm": 0.22783181071281433, + "learning_rate": 0.000270641447368421, + "loss": 0.6204, + "step": 509 + }, + { + "epoch": 0.5658022465677437, + "grad_norm": 0.20822562277317047, + "learning_rate": 0.0002705180921052631, + "loss": 0.3751, + "step": 510 + }, + { + "epoch": 0.5669116627374844, + "grad_norm": 0.1988370418548584, + "learning_rate": 0.0002703947368421052, + "loss": 0.5364, + "step": 511 + }, + { + "epoch": 0.5680210789072251, + "grad_norm": 0.26172971725463867, + "learning_rate": 0.00027027138157894736, + "loss": 0.7982, + "step": 512 + }, + { + "epoch": 0.5691304950769658, + "grad_norm": 0.25788214802742004, + "learning_rate": 0.00027014802631578945, + "loss": 0.5565, + "step": 513 + }, + { + "epoch": 0.5702399112467064, + "grad_norm": 0.23320072889328003, + "learning_rate": 0.0002700246710526316, + "loss": 0.586, + "step": 514 + }, + { + "epoch": 0.5713493274164471, + "grad_norm": 0.1965775191783905, + "learning_rate": 0.0002699013157894737, + "loss": 0.681, + "step": 515 + }, + { + "epoch": 0.5724587435861878, + "grad_norm": 0.4177470803260803, + "learning_rate": 0.0002697779605263158, + "loss": 0.5532, + "step": 516 + }, + { + "epoch": 0.5735681597559285, + "grad_norm": 0.16181616485118866, + "learning_rate": 0.0002696546052631579, + "loss": 0.5, + "step": 517 + }, + { + "epoch": 0.5746775759256691, + "grad_norm": 0.20417065918445587, + "learning_rate": 0.00026953124999999997, + "loss": 0.5589, + "step": 518 + }, + { + "epoch": 0.5757869920954098, + "grad_norm": 0.2022491842508316, + "learning_rate": 0.00026940789473684206, + "loss": 0.5517, + "step": 519 + }, + { + "epoch": 0.5768964082651504, + "grad_norm": 0.3004019558429718, + "learning_rate": 0.00026928453947368415, + "loss": 0.46, + "step": 520 + }, + { + "epoch": 0.5780058244348911, + "grad_norm": 0.2016931027173996, + "learning_rate": 0.0002691611842105263, + "loss": 0.3938, + "step": 521 + }, + { + "epoch": 0.5791152406046318, + "grad_norm": 0.22006861865520477, + "learning_rate": 0.0002690378289473684, + "loss": 0.6507, + "step": 522 + }, + { + "epoch": 0.5802246567743725, + "grad_norm": 0.2743866741657257, + "learning_rate": 0.00026891447368421054, + "loss": 0.6233, + "step": 523 + }, + { + "epoch": 0.5813340729441132, + "grad_norm": 0.2324676811695099, + "learning_rate": 0.00026879111842105264, + "loss": 0.7202, + "step": 524 + }, + { + "epoch": 0.5824434891138538, + "grad_norm": 0.2942185401916504, + "learning_rate": 0.00026866776315789473, + "loss": 0.577, + "step": 525 + }, + { + "epoch": 0.5835529052835945, + "grad_norm": 0.20303772389888763, + "learning_rate": 0.0002685444078947368, + "loss": 0.5867, + "step": 526 + }, + { + "epoch": 0.5846623214533352, + "grad_norm": 0.3175172507762909, + "learning_rate": 0.0002684210526315789, + "loss": 0.5936, + "step": 527 + }, + { + "epoch": 0.5857717376230759, + "grad_norm": 0.26434624195098877, + "learning_rate": 0.000268297697368421, + "loss": 0.8496, + "step": 528 + }, + { + "epoch": 0.5868811537928166, + "grad_norm": 0.20476919412612915, + "learning_rate": 0.00026817434210526316, + "loss": 0.5323, + "step": 529 + }, + { + "epoch": 0.5879905699625572, + "grad_norm": 0.17890197038650513, + "learning_rate": 0.00026805098684210525, + "loss": 0.618, + "step": 530 + }, + { + "epoch": 0.5890999861322979, + "grad_norm": 0.12501509487628937, + "learning_rate": 0.00026792763157894734, + "loss": 0.3985, + "step": 531 + }, + { + "epoch": 0.5902094023020386, + "grad_norm": 0.14128711819648743, + "learning_rate": 0.00026780427631578944, + "loss": 0.3521, + "step": 532 + }, + { + "epoch": 0.5913188184717793, + "grad_norm": 0.24814924597740173, + "learning_rate": 0.0002676809210526316, + "loss": 0.5524, + "step": 533 + }, + { + "epoch": 0.59242823464152, + "grad_norm": 0.1946108043193817, + "learning_rate": 0.0002675575657894737, + "loss": 0.6405, + "step": 534 + }, + { + "epoch": 0.5935376508112605, + "grad_norm": 0.20254820585250854, + "learning_rate": 0.00026743421052631577, + "loss": 0.5621, + "step": 535 + }, + { + "epoch": 0.5946470669810012, + "grad_norm": 0.19255991280078888, + "learning_rate": 0.00026731085526315786, + "loss": 0.7144, + "step": 536 + }, + { + "epoch": 0.5957564831507419, + "grad_norm": 0.20394358038902283, + "learning_rate": 0.00026718749999999996, + "loss": 0.4172, + "step": 537 + }, + { + "epoch": 0.5968658993204826, + "grad_norm": 0.2700938582420349, + "learning_rate": 0.0002670641447368421, + "loss": 0.6688, + "step": 538 + }, + { + "epoch": 0.5979753154902233, + "grad_norm": 0.3247049152851105, + "learning_rate": 0.0002669407894736842, + "loss": 0.632, + "step": 539 + }, + { + "epoch": 0.5990847316599639, + "grad_norm": 0.2315102368593216, + "learning_rate": 0.0002668174342105263, + "loss": 0.5881, + "step": 540 + }, + { + "epoch": 0.6001941478297046, + "grad_norm": 0.28765103220939636, + "learning_rate": 0.0002666940789473684, + "loss": 0.6717, + "step": 541 + }, + { + "epoch": 0.6013035639994453, + "grad_norm": 0.24762357771396637, + "learning_rate": 0.00026657072368421053, + "loss": 0.692, + "step": 542 + }, + { + "epoch": 0.602412980169186, + "grad_norm": 0.25794705748558044, + "learning_rate": 0.0002664473684210526, + "loss": 0.5335, + "step": 543 + }, + { + "epoch": 0.6035223963389267, + "grad_norm": 0.2661387622356415, + "learning_rate": 0.0002663240131578947, + "loss": 0.7539, + "step": 544 + }, + { + "epoch": 0.6046318125086673, + "grad_norm": 0.2539893388748169, + "learning_rate": 0.0002662006578947368, + "loss": 0.4355, + "step": 545 + }, + { + "epoch": 0.605741228678408, + "grad_norm": 0.23424486815929413, + "learning_rate": 0.0002660773026315789, + "loss": 0.5143, + "step": 546 + }, + { + "epoch": 0.6068506448481487, + "grad_norm": 0.1962471306324005, + "learning_rate": 0.00026595394736842105, + "loss": 0.4722, + "step": 547 + }, + { + "epoch": 0.6079600610178894, + "grad_norm": 0.1904420107603073, + "learning_rate": 0.00026583059210526314, + "loss": 0.5893, + "step": 548 + }, + { + "epoch": 0.60906947718763, + "grad_norm": 0.3046864867210388, + "learning_rate": 0.00026570723684210524, + "loss": 0.8071, + "step": 549 + }, + { + "epoch": 0.6101788933573706, + "grad_norm": 0.1840696930885315, + "learning_rate": 0.00026558388157894733, + "loss": 0.4925, + "step": 550 + }, + { + "epoch": 0.6112883095271113, + "grad_norm": 0.33538711071014404, + "learning_rate": 0.0002654605263157894, + "loss": 0.6222, + "step": 551 + }, + { + "epoch": 0.612397725696852, + "grad_norm": 0.22219829261302948, + "learning_rate": 0.00026533717105263157, + "loss": 0.4808, + "step": 552 + }, + { + "epoch": 0.6135071418665927, + "grad_norm": 0.3794260621070862, + "learning_rate": 0.00026521381578947366, + "loss": 0.4464, + "step": 553 + }, + { + "epoch": 0.6146165580363334, + "grad_norm": 0.21123401820659637, + "learning_rate": 0.00026509046052631576, + "loss": 0.6366, + "step": 554 + }, + { + "epoch": 0.615725974206074, + "grad_norm": 0.19497540593147278, + "learning_rate": 0.00026496710526315785, + "loss": 0.5467, + "step": 555 + }, + { + "epoch": 0.6168353903758147, + "grad_norm": 0.18902145326137543, + "learning_rate": 0.00026484375, + "loss": 0.4334, + "step": 556 + }, + { + "epoch": 0.6179448065455554, + "grad_norm": 0.24114537239074707, + "learning_rate": 0.0002647203947368421, + "loss": 0.5268, + "step": 557 + }, + { + "epoch": 0.6190542227152961, + "grad_norm": 0.18477365374565125, + "learning_rate": 0.0002645970394736842, + "loss": 0.6423, + "step": 558 + }, + { + "epoch": 0.6201636388850368, + "grad_norm": 0.21324200928211212, + "learning_rate": 0.0002644736842105263, + "loss": 0.6366, + "step": 559 + }, + { + "epoch": 0.6212730550547775, + "grad_norm": 0.38751551508903503, + "learning_rate": 0.00026435032894736837, + "loss": 0.6193, + "step": 560 + }, + { + "epoch": 0.6223824712245181, + "grad_norm": 0.22451990842819214, + "learning_rate": 0.0002642269736842105, + "loss": 0.6747, + "step": 561 + }, + { + "epoch": 0.6234918873942588, + "grad_norm": 0.2260679006576538, + "learning_rate": 0.0002641036184210526, + "loss": 0.8031, + "step": 562 + }, + { + "epoch": 0.6246013035639995, + "grad_norm": 0.22427742183208466, + "learning_rate": 0.0002639802631578947, + "loss": 0.7171, + "step": 563 + }, + { + "epoch": 0.6257107197337402, + "grad_norm": 0.20108933746814728, + "learning_rate": 0.00026385690789473685, + "loss": 0.4898, + "step": 564 + }, + { + "epoch": 0.6268201359034808, + "grad_norm": 0.3800278604030609, + "learning_rate": 0.00026373355263157894, + "loss": 0.6466, + "step": 565 + }, + { + "epoch": 0.6279295520732214, + "grad_norm": 0.22784464061260223, + "learning_rate": 0.00026361019736842104, + "loss": 0.5532, + "step": 566 + }, + { + "epoch": 0.6290389682429621, + "grad_norm": 0.23498325049877167, + "learning_rate": 0.00026348684210526313, + "loss": 0.8687, + "step": 567 + }, + { + "epoch": 0.6301483844127028, + "grad_norm": 0.1839025616645813, + "learning_rate": 0.0002633634868421052, + "loss": 0.6382, + "step": 568 + }, + { + "epoch": 0.6312578005824435, + "grad_norm": 0.22980616986751556, + "learning_rate": 0.0002632401315789473, + "loss": 0.6109, + "step": 569 + }, + { + "epoch": 0.6323672167521842, + "grad_norm": 0.17458495497703552, + "learning_rate": 0.0002631167763157894, + "loss": 0.5498, + "step": 570 + }, + { + "epoch": 0.6334766329219248, + "grad_norm": 0.22085556387901306, + "learning_rate": 0.00026299342105263156, + "loss": 0.5102, + "step": 571 + }, + { + "epoch": 0.6345860490916655, + "grad_norm": 0.3213456869125366, + "learning_rate": 0.00026287006578947365, + "loss": 0.7377, + "step": 572 + }, + { + "epoch": 0.6356954652614062, + "grad_norm": 0.2649673819541931, + "learning_rate": 0.0002627467105263158, + "loss": 0.6664, + "step": 573 + }, + { + "epoch": 0.6368048814311469, + "grad_norm": 0.3784686326980591, + "learning_rate": 0.0002626233552631579, + "loss": 0.9214, + "step": 574 + }, + { + "epoch": 0.6379142976008876, + "grad_norm": 0.1708430051803589, + "learning_rate": 0.0002625, + "loss": 0.4383, + "step": 575 + }, + { + "epoch": 0.6390237137706282, + "grad_norm": 0.26163679361343384, + "learning_rate": 0.0002623766447368421, + "loss": 0.8272, + "step": 576 + }, + { + "epoch": 0.6401331299403689, + "grad_norm": 0.16542355716228485, + "learning_rate": 0.00026225328947368417, + "loss": 0.4739, + "step": 577 + }, + { + "epoch": 0.6412425461101096, + "grad_norm": 0.2180267572402954, + "learning_rate": 0.00026212993421052626, + "loss": 0.6221, + "step": 578 + }, + { + "epoch": 0.6423519622798503, + "grad_norm": 0.21957628428936005, + "learning_rate": 0.0002620065789473684, + "loss": 0.6062, + "step": 579 + }, + { + "epoch": 0.643461378449591, + "grad_norm": 0.20948325097560883, + "learning_rate": 0.0002618832236842105, + "loss": 0.5542, + "step": 580 + }, + { + "epoch": 0.6445707946193315, + "grad_norm": 0.28024452924728394, + "learning_rate": 0.0002617598684210526, + "loss": 0.718, + "step": 581 + }, + { + "epoch": 0.6456802107890722, + "grad_norm": 0.1376865804195404, + "learning_rate": 0.00026163651315789474, + "loss": 0.4168, + "step": 582 + }, + { + "epoch": 0.6467896269588129, + "grad_norm": 0.15050861239433289, + "learning_rate": 0.00026151315789473684, + "loss": 0.5308, + "step": 583 + }, + { + "epoch": 0.6478990431285536, + "grad_norm": 0.21709243953227997, + "learning_rate": 0.00026138980263157893, + "loss": 0.677, + "step": 584 + }, + { + "epoch": 0.6490084592982943, + "grad_norm": 0.37020203471183777, + "learning_rate": 0.000261266447368421, + "loss": 0.6208, + "step": 585 + }, + { + "epoch": 0.6501178754680349, + "grad_norm": 0.2256883978843689, + "learning_rate": 0.0002611430921052631, + "loss": 0.7711, + "step": 586 + }, + { + "epoch": 0.6512272916377756, + "grad_norm": 0.22099201381206512, + "learning_rate": 0.0002610197368421052, + "loss": 0.6091, + "step": 587 + }, + { + "epoch": 0.6523367078075163, + "grad_norm": 0.23496872186660767, + "learning_rate": 0.00026089638157894736, + "loss": 0.2471, + "step": 588 + }, + { + "epoch": 0.653446123977257, + "grad_norm": 0.2800827622413635, + "learning_rate": 0.00026077302631578945, + "loss": 0.5343, + "step": 589 + }, + { + "epoch": 0.6545555401469977, + "grad_norm": 0.22502388060092926, + "learning_rate": 0.00026064967105263154, + "loss": 0.5117, + "step": 590 + }, + { + "epoch": 0.6556649563167383, + "grad_norm": 0.1460188329219818, + "learning_rate": 0.0002605263157894737, + "loss": 0.4772, + "step": 591 + }, + { + "epoch": 0.656774372486479, + "grad_norm": 0.16591776907444, + "learning_rate": 0.0002604029605263158, + "loss": 0.6087, + "step": 592 + }, + { + "epoch": 0.6578837886562197, + "grad_norm": 0.13937248289585114, + "learning_rate": 0.0002602796052631579, + "loss": 0.5587, + "step": 593 + }, + { + "epoch": 0.6589932048259604, + "grad_norm": 0.2956066131591797, + "learning_rate": 0.00026015624999999997, + "loss": 0.7117, + "step": 594 + }, + { + "epoch": 0.660102620995701, + "grad_norm": 0.495911568403244, + "learning_rate": 0.00026003289473684206, + "loss": 0.8364, + "step": 595 + }, + { + "epoch": 0.6612120371654416, + "grad_norm": 0.1725756675004959, + "learning_rate": 0.00025990953947368416, + "loss": 0.4979, + "step": 596 + }, + { + "epoch": 0.6623214533351823, + "grad_norm": 0.25791987776756287, + "learning_rate": 0.0002597861842105263, + "loss": 0.7304, + "step": 597 + }, + { + "epoch": 0.663430869504923, + "grad_norm": 0.18017613887786865, + "learning_rate": 0.0002596628289473684, + "loss": 0.5706, + "step": 598 + }, + { + "epoch": 0.6645402856746637, + "grad_norm": 0.25128671526908875, + "learning_rate": 0.00025953947368421055, + "loss": 0.5846, + "step": 599 + }, + { + "epoch": 0.6656497018444044, + "grad_norm": 0.34931543469429016, + "learning_rate": 0.00025941611842105264, + "loss": 0.7017, + "step": 600 + }, + { + "epoch": 0.666759118014145, + "grad_norm": 0.25030258297920227, + "learning_rate": 0.00025929276315789473, + "loss": 0.5193, + "step": 601 + }, + { + "epoch": 0.6678685341838857, + "grad_norm": 0.236861452460289, + "learning_rate": 0.0002591694078947368, + "loss": 0.6901, + "step": 602 + }, + { + "epoch": 0.6689779503536264, + "grad_norm": 0.308292418718338, + "learning_rate": 0.0002590460526315789, + "loss": 0.4285, + "step": 603 + }, + { + "epoch": 0.6700873665233671, + "grad_norm": 0.2141687422990799, + "learning_rate": 0.000258922697368421, + "loss": 0.3857, + "step": 604 + }, + { + "epoch": 0.6711967826931078, + "grad_norm": 0.164393350481987, + "learning_rate": 0.0002587993421052631, + "loss": 0.5326, + "step": 605 + }, + { + "epoch": 0.6723061988628485, + "grad_norm": 0.30191662907600403, + "learning_rate": 0.00025867598684210525, + "loss": 0.4978, + "step": 606 + }, + { + "epoch": 0.6734156150325891, + "grad_norm": 0.2955259382724762, + "learning_rate": 0.00025855263157894734, + "loss": 0.5253, + "step": 607 + }, + { + "epoch": 0.6745250312023298, + "grad_norm": 0.22022663056850433, + "learning_rate": 0.00025842927631578944, + "loss": 0.7104, + "step": 608 + }, + { + "epoch": 0.6756344473720705, + "grad_norm": 0.21236523985862732, + "learning_rate": 0.0002583059210526316, + "loss": 0.5324, + "step": 609 + }, + { + "epoch": 0.6767438635418112, + "grad_norm": 0.1945660263299942, + "learning_rate": 0.0002581825657894737, + "loss": 0.5858, + "step": 610 + }, + { + "epoch": 0.6778532797115518, + "grad_norm": 0.31970614194869995, + "learning_rate": 0.00025805921052631577, + "loss": 0.7485, + "step": 611 + }, + { + "epoch": 0.6789626958812924, + "grad_norm": 0.3211202621459961, + "learning_rate": 0.00025793585526315786, + "loss": 0.5554, + "step": 612 + }, + { + "epoch": 0.6800721120510331, + "grad_norm": 0.17990931868553162, + "learning_rate": 0.00025781249999999996, + "loss": 0.6221, + "step": 613 + }, + { + "epoch": 0.6811815282207738, + "grad_norm": 0.3069283366203308, + "learning_rate": 0.0002576891447368421, + "loss": 0.7163, + "step": 614 + }, + { + "epoch": 0.6822909443905145, + "grad_norm": 0.19691799581050873, + "learning_rate": 0.0002575657894736842, + "loss": 0.4445, + "step": 615 + }, + { + "epoch": 0.6834003605602552, + "grad_norm": 0.18806682527065277, + "learning_rate": 0.0002574424342105263, + "loss": 0.5781, + "step": 616 + }, + { + "epoch": 0.6845097767299958, + "grad_norm": 0.24056103825569153, + "learning_rate": 0.0002573190789473684, + "loss": 0.5351, + "step": 617 + }, + { + "epoch": 0.6856191928997365, + "grad_norm": 0.2140192836523056, + "learning_rate": 0.00025719572368421053, + "loss": 0.6222, + "step": 618 + }, + { + "epoch": 0.6867286090694772, + "grad_norm": 0.227885901927948, + "learning_rate": 0.0002570723684210526, + "loss": 0.6394, + "step": 619 + }, + { + "epoch": 0.6878380252392179, + "grad_norm": 0.37848934531211853, + "learning_rate": 0.0002569490131578947, + "loss": 0.6231, + "step": 620 + }, + { + "epoch": 0.6889474414089586, + "grad_norm": 0.290159672498703, + "learning_rate": 0.0002568256578947368, + "loss": 0.4614, + "step": 621 + }, + { + "epoch": 0.6900568575786992, + "grad_norm": 0.18309064209461212, + "learning_rate": 0.0002567023026315789, + "loss": 0.7418, + "step": 622 + }, + { + "epoch": 0.6911662737484399, + "grad_norm": 0.20930887758731842, + "learning_rate": 0.00025657894736842105, + "loss": 0.6572, + "step": 623 + }, + { + "epoch": 0.6922756899181806, + "grad_norm": 0.24094976484775543, + "learning_rate": 0.00025645559210526315, + "loss": 0.5702, + "step": 624 + }, + { + "epoch": 0.6933851060879213, + "grad_norm": 0.22989119589328766, + "learning_rate": 0.00025633223684210524, + "loss": 0.4517, + "step": 625 + }, + { + "epoch": 0.694494522257662, + "grad_norm": 0.2922836244106293, + "learning_rate": 0.00025620888157894733, + "loss": 0.6436, + "step": 626 + }, + { + "epoch": 0.6956039384274025, + "grad_norm": 0.2564910650253296, + "learning_rate": 0.0002560855263157894, + "loss": 0.7321, + "step": 627 + }, + { + "epoch": 0.6967133545971432, + "grad_norm": 0.26571327447891235, + "learning_rate": 0.00025596217105263157, + "loss": 0.4964, + "step": 628 + }, + { + "epoch": 0.6978227707668839, + "grad_norm": 0.5190631151199341, + "learning_rate": 0.00025583881578947367, + "loss": 0.7778, + "step": 629 + }, + { + "epoch": 0.6989321869366246, + "grad_norm": 0.17522084712982178, + "learning_rate": 0.00025571546052631576, + "loss": 0.4768, + "step": 630 + }, + { + "epoch": 0.7000416031063653, + "grad_norm": 0.2567191421985626, + "learning_rate": 0.00025559210526315785, + "loss": 0.5936, + "step": 631 + }, + { + "epoch": 0.7011510192761059, + "grad_norm": 0.46300792694091797, + "learning_rate": 0.00025546875, + "loss": 0.6074, + "step": 632 + }, + { + "epoch": 0.7022604354458466, + "grad_norm": 0.1528376042842865, + "learning_rate": 0.0002553453947368421, + "loss": 0.4779, + "step": 633 + }, + { + "epoch": 0.7033698516155873, + "grad_norm": 0.3135516941547394, + "learning_rate": 0.0002552220394736842, + "loss": 0.8674, + "step": 634 + }, + { + "epoch": 0.704479267785328, + "grad_norm": 0.22676752507686615, + "learning_rate": 0.0002550986842105263, + "loss": 0.5182, + "step": 635 + }, + { + "epoch": 0.7055886839550687, + "grad_norm": 0.21783347427845, + "learning_rate": 0.00025497532894736837, + "loss": 0.4951, + "step": 636 + }, + { + "epoch": 0.7066981001248093, + "grad_norm": 0.2860846221446991, + "learning_rate": 0.0002548519736842105, + "loss": 0.4363, + "step": 637 + }, + { + "epoch": 0.70780751629455, + "grad_norm": 0.281086266040802, + "learning_rate": 0.0002547286184210526, + "loss": 0.5309, + "step": 638 + }, + { + "epoch": 0.7089169324642907, + "grad_norm": 0.30188825726509094, + "learning_rate": 0.0002546052631578947, + "loss": 0.5422, + "step": 639 + }, + { + "epoch": 0.7100263486340314, + "grad_norm": 0.26086941361427307, + "learning_rate": 0.0002544819078947368, + "loss": 0.4605, + "step": 640 + }, + { + "epoch": 0.711135764803772, + "grad_norm": 0.3494928777217865, + "learning_rate": 0.00025435855263157895, + "loss": 0.4638, + "step": 641 + }, + { + "epoch": 0.7122451809735126, + "grad_norm": 0.282701313495636, + "learning_rate": 0.00025423519736842104, + "loss": 0.6283, + "step": 642 + }, + { + "epoch": 0.7133545971432533, + "grad_norm": 0.16015778481960297, + "learning_rate": 0.00025411184210526313, + "loss": 0.4514, + "step": 643 + }, + { + "epoch": 0.714464013312994, + "grad_norm": 0.2207580804824829, + "learning_rate": 0.0002539884868421052, + "loss": 0.7127, + "step": 644 + }, + { + "epoch": 0.7155734294827347, + "grad_norm": 0.2917775511741638, + "learning_rate": 0.0002538651315789473, + "loss": 0.7254, + "step": 645 + }, + { + "epoch": 0.7166828456524754, + "grad_norm": 0.26504382491111755, + "learning_rate": 0.0002537417763157894, + "loss": 0.5335, + "step": 646 + }, + { + "epoch": 0.717792261822216, + "grad_norm": 0.3495447635650635, + "learning_rate": 0.00025361842105263156, + "loss": 0.8425, + "step": 647 + }, + { + "epoch": 0.7189016779919567, + "grad_norm": 0.18636609613895416, + "learning_rate": 0.00025349506578947365, + "loss": 0.5437, + "step": 648 + }, + { + "epoch": 0.7200110941616974, + "grad_norm": 0.18877021968364716, + "learning_rate": 0.0002533717105263158, + "loss": 0.5643, + "step": 649 + }, + { + "epoch": 0.7211205103314381, + "grad_norm": 0.15186652541160583, + "learning_rate": 0.0002532483552631579, + "loss": 0.532, + "step": 650 + }, + { + "epoch": 0.7222299265011788, + "grad_norm": 0.14779016375541687, + "learning_rate": 0.000253125, + "loss": 0.566, + "step": 651 + }, + { + "epoch": 0.7233393426709195, + "grad_norm": 0.19088061153888702, + "learning_rate": 0.0002530016447368421, + "loss": 0.5351, + "step": 652 + }, + { + "epoch": 0.7244487588406601, + "grad_norm": 0.17555399239063263, + "learning_rate": 0.00025287828947368417, + "loss": 0.4758, + "step": 653 + }, + { + "epoch": 0.7255581750104008, + "grad_norm": 0.2535383999347687, + "learning_rate": 0.00025275493421052627, + "loss": 0.6948, + "step": 654 + }, + { + "epoch": 0.7266675911801415, + "grad_norm": 0.2938152551651001, + "learning_rate": 0.00025263157894736836, + "loss": 0.626, + "step": 655 + }, + { + "epoch": 0.7277770073498822, + "grad_norm": 0.2159254252910614, + "learning_rate": 0.0002525082236842105, + "loss": 0.471, + "step": 656 + }, + { + "epoch": 0.7288864235196229, + "grad_norm": 0.20253120362758636, + "learning_rate": 0.0002523848684210526, + "loss": 0.4693, + "step": 657 + }, + { + "epoch": 0.7299958396893634, + "grad_norm": 0.49963316321372986, + "learning_rate": 0.00025226151315789475, + "loss": 0.5185, + "step": 658 + }, + { + "epoch": 0.7311052558591041, + "grad_norm": 0.2259654551744461, + "learning_rate": 0.00025213815789473684, + "loss": 0.7249, + "step": 659 + }, + { + "epoch": 0.7322146720288448, + "grad_norm": 0.21344606578350067, + "learning_rate": 0.00025201480263157893, + "loss": 0.4977, + "step": 660 + }, + { + "epoch": 0.7333240881985855, + "grad_norm": 0.2689608037471771, + "learning_rate": 0.000251891447368421, + "loss": 0.3908, + "step": 661 + }, + { + "epoch": 0.7344335043683262, + "grad_norm": 0.18120594322681427, + "learning_rate": 0.0002517680921052631, + "loss": 0.4518, + "step": 662 + }, + { + "epoch": 0.7355429205380668, + "grad_norm": 0.3393332064151764, + "learning_rate": 0.0002516447368421052, + "loss": 0.6439, + "step": 663 + }, + { + "epoch": 0.7366523367078075, + "grad_norm": 0.21560847759246826, + "learning_rate": 0.00025152138157894736, + "loss": 0.6041, + "step": 664 + }, + { + "epoch": 0.7377617528775482, + "grad_norm": 0.4047819972038269, + "learning_rate": 0.00025139802631578945, + "loss": 0.7444, + "step": 665 + }, + { + "epoch": 0.7388711690472889, + "grad_norm": 0.17038540542125702, + "learning_rate": 0.00025127467105263155, + "loss": 0.4562, + "step": 666 + }, + { + "epoch": 0.7399805852170296, + "grad_norm": 0.1971050649881363, + "learning_rate": 0.0002511513157894737, + "loss": 0.4497, + "step": 667 + }, + { + "epoch": 0.7410900013867702, + "grad_norm": 0.233141228556633, + "learning_rate": 0.0002510279605263158, + "loss": 0.6446, + "step": 668 + }, + { + "epoch": 0.7421994175565109, + "grad_norm": 0.22134460508823395, + "learning_rate": 0.0002509046052631579, + "loss": 0.4393, + "step": 669 + }, + { + "epoch": 0.7433088337262516, + "grad_norm": 0.18341104686260223, + "learning_rate": 0.00025078125, + "loss": 0.6756, + "step": 670 + }, + { + "epoch": 0.7444182498959923, + "grad_norm": 0.2109827697277069, + "learning_rate": 0.00025065789473684207, + "loss": 0.5686, + "step": 671 + }, + { + "epoch": 0.745527666065733, + "grad_norm": 0.249485045671463, + "learning_rate": 0.00025053453947368416, + "loss": 0.5849, + "step": 672 + }, + { + "epoch": 0.7466370822354735, + "grad_norm": 0.36929988861083984, + "learning_rate": 0.0002504111842105263, + "loss": 0.5873, + "step": 673 + }, + { + "epoch": 0.7477464984052142, + "grad_norm": 0.2066950798034668, + "learning_rate": 0.0002502878289473684, + "loss": 0.4152, + "step": 674 + }, + { + "epoch": 0.7488559145749549, + "grad_norm": 0.14233893156051636, + "learning_rate": 0.0002501644736842105, + "loss": 0.4972, + "step": 675 + }, + { + "epoch": 0.7499653307446956, + "grad_norm": 0.24324600398540497, + "learning_rate": 0.00025004111842105264, + "loss": 0.4693, + "step": 676 + }, + { + "epoch": 0.7510747469144363, + "grad_norm": 0.18247400224208832, + "learning_rate": 0.00024991776315789473, + "loss": 0.6579, + "step": 677 + }, + { + "epoch": 0.7521841630841769, + "grad_norm": 0.20285794138908386, + "learning_rate": 0.00024979440789473683, + "loss": 0.3657, + "step": 678 + }, + { + "epoch": 0.7532935792539176, + "grad_norm": 0.2386598140001297, + "learning_rate": 0.0002496710526315789, + "loss": 0.5693, + "step": 679 + }, + { + "epoch": 0.7544029954236583, + "grad_norm": 0.28417715430259705, + "learning_rate": 0.000249547697368421, + "loss": 0.5416, + "step": 680 + }, + { + "epoch": 0.755512411593399, + "grad_norm": 0.4812435805797577, + "learning_rate": 0.0002494243421052631, + "loss": 0.6869, + "step": 681 + }, + { + "epoch": 0.7566218277631397, + "grad_norm": 0.1943156123161316, + "learning_rate": 0.00024930098684210525, + "loss": 0.4625, + "step": 682 + }, + { + "epoch": 0.7577312439328803, + "grad_norm": 0.22697441279888153, + "learning_rate": 0.00024917763157894735, + "loss": 0.4708, + "step": 683 + }, + { + "epoch": 0.758840660102621, + "grad_norm": 0.32567355036735535, + "learning_rate": 0.00024905427631578944, + "loss": 0.4607, + "step": 684 + }, + { + "epoch": 0.7599500762723617, + "grad_norm": 0.2505040168762207, + "learning_rate": 0.0002489309210526316, + "loss": 0.5327, + "step": 685 + }, + { + "epoch": 0.7610594924421024, + "grad_norm": 0.19925042986869812, + "learning_rate": 0.0002488075657894737, + "loss": 0.7429, + "step": 686 + }, + { + "epoch": 0.762168908611843, + "grad_norm": 0.19456201791763306, + "learning_rate": 0.0002486842105263158, + "loss": 0.6922, + "step": 687 + }, + { + "epoch": 0.7632783247815836, + "grad_norm": 0.2642272412776947, + "learning_rate": 0.00024856085526315787, + "loss": 0.5104, + "step": 688 + }, + { + "epoch": 0.7643877409513243, + "grad_norm": 0.22339658439159393, + "learning_rate": 0.00024843749999999996, + "loss": 0.5151, + "step": 689 + }, + { + "epoch": 0.765497157121065, + "grad_norm": 0.23145624995231628, + "learning_rate": 0.00024831414473684205, + "loss": 0.4034, + "step": 690 + }, + { + "epoch": 0.7666065732908057, + "grad_norm": 0.20969951152801514, + "learning_rate": 0.0002481907894736842, + "loss": 0.5701, + "step": 691 + }, + { + "epoch": 0.7677159894605464, + "grad_norm": 0.2449328750371933, + "learning_rate": 0.0002480674342105263, + "loss": 0.4631, + "step": 692 + }, + { + "epoch": 0.768825405630287, + "grad_norm": 0.2677520513534546, + "learning_rate": 0.0002479440789473684, + "loss": 0.5099, + "step": 693 + }, + { + "epoch": 0.7699348218000277, + "grad_norm": 0.21626238524913788, + "learning_rate": 0.00024782072368421053, + "loss": 0.5724, + "step": 694 + }, + { + "epoch": 0.7710442379697684, + "grad_norm": 0.2530820071697235, + "learning_rate": 0.00024769736842105263, + "loss": 0.4764, + "step": 695 + }, + { + "epoch": 0.7721536541395091, + "grad_norm": 0.14730204641819, + "learning_rate": 0.0002475740131578947, + "loss": 0.4657, + "step": 696 + }, + { + "epoch": 0.7732630703092498, + "grad_norm": 0.21123374998569489, + "learning_rate": 0.0002474506578947368, + "loss": 0.6438, + "step": 697 + }, + { + "epoch": 0.7743724864789905, + "grad_norm": 0.2234024703502655, + "learning_rate": 0.0002473273026315789, + "loss": 0.7459, + "step": 698 + }, + { + "epoch": 0.7754819026487311, + "grad_norm": 0.25458112359046936, + "learning_rate": 0.00024720394736842105, + "loss": 0.5767, + "step": 699 + }, + { + "epoch": 0.7765913188184718, + "grad_norm": 0.19247955083847046, + "learning_rate": 0.00024708059210526315, + "loss": 0.4738, + "step": 700 + }, + { + "epoch": 0.7777007349882125, + "grad_norm": 0.3082413673400879, + "learning_rate": 0.00024695723684210524, + "loss": 0.5998, + "step": 701 + }, + { + "epoch": 0.7788101511579532, + "grad_norm": 0.21955102682113647, + "learning_rate": 0.00024683388157894733, + "loss": 0.6763, + "step": 702 + }, + { + "epoch": 0.7799195673276939, + "grad_norm": 0.21807517111301422, + "learning_rate": 0.00024671052631578943, + "loss": 0.6072, + "step": 703 + }, + { + "epoch": 0.7810289834974344, + "grad_norm": 0.2064640372991562, + "learning_rate": 0.0002465871710526316, + "loss": 0.5148, + "step": 704 + }, + { + "epoch": 0.7821383996671751, + "grad_norm": 0.19100461900234222, + "learning_rate": 0.00024646381578947367, + "loss": 0.3789, + "step": 705 + }, + { + "epoch": 0.7832478158369158, + "grad_norm": 0.4141659140586853, + "learning_rate": 0.00024634046052631576, + "loss": 0.5838, + "step": 706 + }, + { + "epoch": 0.7843572320066565, + "grad_norm": 0.260050892829895, + "learning_rate": 0.00024621710526315785, + "loss": 0.6696, + "step": 707 + }, + { + "epoch": 0.7854666481763972, + "grad_norm": 0.22340181469917297, + "learning_rate": 0.00024609375, + "loss": 0.7745, + "step": 708 + }, + { + "epoch": 0.7865760643461378, + "grad_norm": 0.23725625872612, + "learning_rate": 0.0002459703947368421, + "loss": 0.5473, + "step": 709 + }, + { + "epoch": 0.7876854805158785, + "grad_norm": 0.2455572783946991, + "learning_rate": 0.0002458470394736842, + "loss": 0.5893, + "step": 710 + }, + { + "epoch": 0.7887948966856192, + "grad_norm": 0.2484428882598877, + "learning_rate": 0.0002457236842105263, + "loss": 0.6448, + "step": 711 + }, + { + "epoch": 0.7899043128553599, + "grad_norm": 0.1755114644765854, + "learning_rate": 0.0002456003289473684, + "loss": 0.5489, + "step": 712 + }, + { + "epoch": 0.7910137290251006, + "grad_norm": 0.18849492073059082, + "learning_rate": 0.0002454769736842105, + "loss": 0.5535, + "step": 713 + }, + { + "epoch": 0.7921231451948412, + "grad_norm": 0.2751491665840149, + "learning_rate": 0.0002453536184210526, + "loss": 0.6029, + "step": 714 + }, + { + "epoch": 0.7932325613645819, + "grad_norm": 0.2292255163192749, + "learning_rate": 0.0002452302631578947, + "loss": 0.452, + "step": 715 + }, + { + "epoch": 0.7943419775343226, + "grad_norm": 0.2448405623435974, + "learning_rate": 0.0002451069078947368, + "loss": 0.6581, + "step": 716 + }, + { + "epoch": 0.7954513937040633, + "grad_norm": 0.22489365935325623, + "learning_rate": 0.00024498355263157895, + "loss": 0.3962, + "step": 717 + }, + { + "epoch": 0.796560809873804, + "grad_norm": 0.31602340936660767, + "learning_rate": 0.00024486019736842104, + "loss": 0.6768, + "step": 718 + }, + { + "epoch": 0.7976702260435445, + "grad_norm": 0.3364429175853729, + "learning_rate": 0.00024473684210526314, + "loss": 0.7009, + "step": 719 + }, + { + "epoch": 0.7987796422132852, + "grad_norm": 0.2554149031639099, + "learning_rate": 0.00024461348684210523, + "loss": 0.7788, + "step": 720 + }, + { + "epoch": 0.7998890583830259, + "grad_norm": 0.3130899667739868, + "learning_rate": 0.0002444901315789473, + "loss": 0.698, + "step": 721 + }, + { + "epoch": 0.8009984745527666, + "grad_norm": 0.17528441548347473, + "learning_rate": 0.0002443667763157894, + "loss": 0.5581, + "step": 722 + }, + { + "epoch": 0.8021078907225073, + "grad_norm": 0.18897710740566254, + "learning_rate": 0.00024424342105263156, + "loss": 0.4447, + "step": 723 + }, + { + "epoch": 0.8032173068922479, + "grad_norm": 0.2809373438358307, + "learning_rate": 0.00024412006578947368, + "loss": 0.4655, + "step": 724 + }, + { + "epoch": 0.8043267230619886, + "grad_norm": 0.16998441517353058, + "learning_rate": 0.00024399671052631578, + "loss": 0.568, + "step": 725 + }, + { + "epoch": 0.8054361392317293, + "grad_norm": 0.21042829751968384, + "learning_rate": 0.00024387335526315787, + "loss": 0.581, + "step": 726 + }, + { + "epoch": 0.80654555540147, + "grad_norm": 0.18260133266448975, + "learning_rate": 0.00024375, + "loss": 0.4479, + "step": 727 + }, + { + "epoch": 0.8076549715712107, + "grad_norm": 0.323022723197937, + "learning_rate": 0.00024362664473684208, + "loss": 0.6448, + "step": 728 + }, + { + "epoch": 0.8087643877409513, + "grad_norm": 0.2532772123813629, + "learning_rate": 0.00024350328947368418, + "loss": 0.6414, + "step": 729 + }, + { + "epoch": 0.809873803910692, + "grad_norm": 0.186369851231575, + "learning_rate": 0.0002433799342105263, + "loss": 0.5935, + "step": 730 + }, + { + "epoch": 0.8109832200804327, + "grad_norm": 0.22981056571006775, + "learning_rate": 0.0002432565789473684, + "loss": 0.4538, + "step": 731 + }, + { + "epoch": 0.8120926362501734, + "grad_norm": 0.2305302619934082, + "learning_rate": 0.0002431332236842105, + "loss": 0.3573, + "step": 732 + }, + { + "epoch": 0.8132020524199141, + "grad_norm": 0.19394156336784363, + "learning_rate": 0.00024300986842105263, + "loss": 0.7218, + "step": 733 + }, + { + "epoch": 0.8143114685896546, + "grad_norm": 0.26196351647377014, + "learning_rate": 0.00024288651315789472, + "loss": 0.5631, + "step": 734 + }, + { + "epoch": 0.8154208847593953, + "grad_norm": 0.21519336104393005, + "learning_rate": 0.00024276315789473682, + "loss": 0.7168, + "step": 735 + }, + { + "epoch": 0.816530300929136, + "grad_norm": 0.14451566338539124, + "learning_rate": 0.00024263980263157894, + "loss": 0.4952, + "step": 736 + }, + { + "epoch": 0.8176397170988767, + "grad_norm": 0.19086521863937378, + "learning_rate": 0.00024251644736842103, + "loss": 0.441, + "step": 737 + }, + { + "epoch": 0.8187491332686174, + "grad_norm": 0.2369484156370163, + "learning_rate": 0.00024239309210526312, + "loss": 0.6517, + "step": 738 + }, + { + "epoch": 0.819858549438358, + "grad_norm": 0.22705571353435516, + "learning_rate": 0.00024226973684210524, + "loss": 0.5155, + "step": 739 + }, + { + "epoch": 0.8209679656080987, + "grad_norm": 0.16014251112937927, + "learning_rate": 0.00024214638157894734, + "loss": 0.6391, + "step": 740 + }, + { + "epoch": 0.8220773817778394, + "grad_norm": 0.24363452196121216, + "learning_rate": 0.00024202302631578943, + "loss": 0.6092, + "step": 741 + }, + { + "epoch": 0.8231867979475801, + "grad_norm": 0.16901741921901703, + "learning_rate": 0.00024189967105263158, + "loss": 0.394, + "step": 742 + }, + { + "epoch": 0.8242962141173208, + "grad_norm": 0.15532980859279633, + "learning_rate": 0.00024177631578947367, + "loss": 0.516, + "step": 743 + }, + { + "epoch": 0.8254056302870615, + "grad_norm": 0.1833130270242691, + "learning_rate": 0.00024165296052631576, + "loss": 0.4056, + "step": 744 + }, + { + "epoch": 0.8265150464568021, + "grad_norm": 0.23910647630691528, + "learning_rate": 0.00024152960526315788, + "loss": 0.5056, + "step": 745 + }, + { + "epoch": 0.8276244626265428, + "grad_norm": 0.4546511769294739, + "learning_rate": 0.00024140624999999998, + "loss": 0.701, + "step": 746 + }, + { + "epoch": 0.8287338787962835, + "grad_norm": 0.20680895447731018, + "learning_rate": 0.00024128289473684207, + "loss": 0.5841, + "step": 747 + }, + { + "epoch": 0.8298432949660242, + "grad_norm": 0.29819783568382263, + "learning_rate": 0.0002411595394736842, + "loss": 0.5503, + "step": 748 + }, + { + "epoch": 0.8309527111357649, + "grad_norm": 0.18847742676734924, + "learning_rate": 0.00024103618421052628, + "loss": 0.5588, + "step": 749 + }, + { + "epoch": 0.8320621273055054, + "grad_norm": 0.2954421937465668, + "learning_rate": 0.00024091282894736838, + "loss": 0.5611, + "step": 750 + }, + { + "epoch": 0.8331715434752461, + "grad_norm": 0.2604624927043915, + "learning_rate": 0.00024078947368421052, + "loss": 0.6853, + "step": 751 + }, + { + "epoch": 0.8342809596449868, + "grad_norm": 0.31594741344451904, + "learning_rate": 0.00024066611842105262, + "loss": 0.383, + "step": 752 + }, + { + "epoch": 0.8353903758147275, + "grad_norm": 0.2093072086572647, + "learning_rate": 0.00024054276315789474, + "loss": 0.6134, + "step": 753 + }, + { + "epoch": 0.8364997919844682, + "grad_norm": 0.22096338868141174, + "learning_rate": 0.00024041940789473683, + "loss": 0.4454, + "step": 754 + }, + { + "epoch": 0.8376092081542088, + "grad_norm": 0.31209510564804077, + "learning_rate": 0.00024029605263157892, + "loss": 0.8408, + "step": 755 + }, + { + "epoch": 0.8387186243239495, + "grad_norm": 0.36502930521965027, + "learning_rate": 0.00024017269736842104, + "loss": 0.5053, + "step": 756 + }, + { + "epoch": 0.8398280404936902, + "grad_norm": 0.24059328436851501, + "learning_rate": 0.00024004934210526314, + "loss": 0.6181, + "step": 757 + }, + { + "epoch": 0.8409374566634309, + "grad_norm": 0.202326238155365, + "learning_rate": 0.00023992598684210523, + "loss": 0.4569, + "step": 758 + }, + { + "epoch": 0.8420468728331716, + "grad_norm": 0.14637093245983124, + "learning_rate": 0.00023980263157894732, + "loss": 0.438, + "step": 759 + }, + { + "epoch": 0.8431562890029122, + "grad_norm": 0.15636491775512695, + "learning_rate": 0.00023967927631578944, + "loss": 0.6549, + "step": 760 + }, + { + "epoch": 0.8442657051726529, + "grad_norm": 0.25059443712234497, + "learning_rate": 0.00023955592105263156, + "loss": 0.5497, + "step": 761 + }, + { + "epoch": 0.8453751213423936, + "grad_norm": 0.1603458672761917, + "learning_rate": 0.00023943256578947368, + "loss": 0.5023, + "step": 762 + }, + { + "epoch": 0.8464845375121343, + "grad_norm": 0.202356219291687, + "learning_rate": 0.00023930921052631578, + "loss": 0.6076, + "step": 763 + }, + { + "epoch": 0.847593953681875, + "grad_norm": 0.2810531258583069, + "learning_rate": 0.00023918585526315787, + "loss": 0.4029, + "step": 764 + }, + { + "epoch": 0.8487033698516155, + "grad_norm": 0.21425937116146088, + "learning_rate": 0.0002390625, + "loss": 0.4594, + "step": 765 + }, + { + "epoch": 0.8498127860213562, + "grad_norm": 0.29210686683654785, + "learning_rate": 0.00023893914473684208, + "loss": 0.4246, + "step": 766 + }, + { + "epoch": 0.8509222021910969, + "grad_norm": 0.483568400144577, + "learning_rate": 0.00023881578947368418, + "loss": 0.6333, + "step": 767 + }, + { + "epoch": 0.8520316183608376, + "grad_norm": 0.22136186063289642, + "learning_rate": 0.0002386924342105263, + "loss": 0.6211, + "step": 768 + }, + { + "epoch": 0.8531410345305783, + "grad_norm": 0.29133930802345276, + "learning_rate": 0.0002385690789473684, + "loss": 0.5317, + "step": 769 + }, + { + "epoch": 0.8542504507003189, + "grad_norm": 0.20380742847919464, + "learning_rate": 0.0002384457236842105, + "loss": 0.6699, + "step": 770 + }, + { + "epoch": 0.8553598668700596, + "grad_norm": 0.16621272265911102, + "learning_rate": 0.00023832236842105263, + "loss": 0.8654, + "step": 771 + }, + { + "epoch": 0.8564692830398003, + "grad_norm": 0.19278384745121002, + "learning_rate": 0.00023819901315789472, + "loss": 0.5077, + "step": 772 + }, + { + "epoch": 0.857578699209541, + "grad_norm": 0.18274825811386108, + "learning_rate": 0.00023807565789473682, + "loss": 0.5807, + "step": 773 + }, + { + "epoch": 0.8586881153792817, + "grad_norm": 0.36240458488464355, + "learning_rate": 0.00023795230263157894, + "loss": 0.5612, + "step": 774 + }, + { + "epoch": 0.8597975315490223, + "grad_norm": 0.2410973459482193, + "learning_rate": 0.00023782894736842103, + "loss": 0.6891, + "step": 775 + }, + { + "epoch": 0.860906947718763, + "grad_norm": 0.2783324718475342, + "learning_rate": 0.00023770559210526312, + "loss": 0.4547, + "step": 776 + }, + { + "epoch": 0.8620163638885037, + "grad_norm": 0.7285773754119873, + "learning_rate": 0.00023758223684210524, + "loss": 0.4894, + "step": 777 + }, + { + "epoch": 0.8631257800582444, + "grad_norm": 0.16931070387363434, + "learning_rate": 0.00023745888157894734, + "loss": 0.5592, + "step": 778 + }, + { + "epoch": 0.8642351962279851, + "grad_norm": 0.24053402245044708, + "learning_rate": 0.00023733552631578943, + "loss": 0.4662, + "step": 779 + }, + { + "epoch": 0.8653446123977256, + "grad_norm": 0.42136144638061523, + "learning_rate": 0.00023721217105263158, + "loss": 0.617, + "step": 780 + }, + { + "epoch": 0.8664540285674663, + "grad_norm": 0.16937246918678284, + "learning_rate": 0.00023708881578947367, + "loss": 0.4531, + "step": 781 + }, + { + "epoch": 0.867563444737207, + "grad_norm": 0.25334227085113525, + "learning_rate": 0.00023696546052631576, + "loss": 0.4013, + "step": 782 + }, + { + "epoch": 0.8686728609069477, + "grad_norm": 0.18785522878170013, + "learning_rate": 0.00023684210526315788, + "loss": 0.5154, + "step": 783 + }, + { + "epoch": 0.8697822770766884, + "grad_norm": 0.21429309248924255, + "learning_rate": 0.00023671874999999998, + "loss": 0.4044, + "step": 784 + }, + { + "epoch": 0.8708916932464291, + "grad_norm": 0.1965511292219162, + "learning_rate": 0.00023659539473684207, + "loss": 0.6695, + "step": 785 + }, + { + "epoch": 0.8720011094161697, + "grad_norm": 0.2309243083000183, + "learning_rate": 0.0002364720394736842, + "loss": 0.5465, + "step": 786 + }, + { + "epoch": 0.8731105255859104, + "grad_norm": 0.2777848243713379, + "learning_rate": 0.00023634868421052628, + "loss": 0.6245, + "step": 787 + }, + { + "epoch": 0.8742199417556511, + "grad_norm": 0.34535712003707886, + "learning_rate": 0.00023622532894736838, + "loss": 0.6035, + "step": 788 + }, + { + "epoch": 0.8753293579253918, + "grad_norm": 0.24912825226783752, + "learning_rate": 0.00023610197368421052, + "loss": 0.5369, + "step": 789 + }, + { + "epoch": 0.8764387740951325, + "grad_norm": 0.23429974913597107, + "learning_rate": 0.00023597861842105262, + "loss": 0.3689, + "step": 790 + }, + { + "epoch": 0.8775481902648731, + "grad_norm": 0.25908464193344116, + "learning_rate": 0.00023585526315789474, + "loss": 0.6301, + "step": 791 + }, + { + "epoch": 0.8786576064346138, + "grad_norm": 0.3178803324699402, + "learning_rate": 0.00023573190789473683, + "loss": 0.707, + "step": 792 + }, + { + "epoch": 0.8797670226043545, + "grad_norm": 0.23064696788787842, + "learning_rate": 0.00023560855263157892, + "loss": 0.5725, + "step": 793 + }, + { + "epoch": 0.8808764387740952, + "grad_norm": 0.2530830502510071, + "learning_rate": 0.00023548519736842102, + "loss": 0.5131, + "step": 794 + }, + { + "epoch": 0.8819858549438359, + "grad_norm": 0.21092426776885986, + "learning_rate": 0.00023536184210526314, + "loss": 0.6452, + "step": 795 + }, + { + "epoch": 0.8830952711135764, + "grad_norm": 0.21597221493721008, + "learning_rate": 0.00023523848684210523, + "loss": 0.5438, + "step": 796 + }, + { + "epoch": 0.8842046872833171, + "grad_norm": 0.21937009692192078, + "learning_rate": 0.00023511513157894732, + "loss": 0.4943, + "step": 797 + }, + { + "epoch": 0.8853141034530578, + "grad_norm": 0.2742394506931305, + "learning_rate": 0.00023499177631578944, + "loss": 0.7567, + "step": 798 + }, + { + "epoch": 0.8864235196227985, + "grad_norm": 0.3151918053627014, + "learning_rate": 0.00023486842105263156, + "loss": 0.7817, + "step": 799 + }, + { + "epoch": 0.8875329357925392, + "grad_norm": 0.23402948677539825, + "learning_rate": 0.00023474506578947368, + "loss": 0.3204, + "step": 800 + }, + { + "epoch": 0.8886423519622798, + "grad_norm": 0.25392022728919983, + "learning_rate": 0.00023462171052631578, + "loss": 0.6914, + "step": 801 + }, + { + "epoch": 0.8897517681320205, + "grad_norm": 0.22734297811985016, + "learning_rate": 0.00023449835526315787, + "loss": 0.5779, + "step": 802 + }, + { + "epoch": 0.8908611843017612, + "grad_norm": 0.2071351408958435, + "learning_rate": 0.000234375, + "loss": 0.6024, + "step": 803 + }, + { + "epoch": 0.8919706004715019, + "grad_norm": 0.2632406949996948, + "learning_rate": 0.00023425164473684208, + "loss": 0.5668, + "step": 804 + }, + { + "epoch": 0.8930800166412426, + "grad_norm": 0.27089810371398926, + "learning_rate": 0.00023412828947368418, + "loss": 0.6043, + "step": 805 + }, + { + "epoch": 0.8941894328109832, + "grad_norm": 0.1918395459651947, + "learning_rate": 0.0002340049342105263, + "loss": 0.3305, + "step": 806 + }, + { + "epoch": 0.8952988489807239, + "grad_norm": 0.183834046125412, + "learning_rate": 0.0002338815789473684, + "loss": 0.603, + "step": 807 + }, + { + "epoch": 0.8964082651504646, + "grad_norm": 0.29509904980659485, + "learning_rate": 0.0002337582236842105, + "loss": 0.6681, + "step": 808 + }, + { + "epoch": 0.8975176813202053, + "grad_norm": 0.24960756301879883, + "learning_rate": 0.00023363486842105263, + "loss": 0.5601, + "step": 809 + }, + { + "epoch": 0.898627097489946, + "grad_norm": 0.5940669775009155, + "learning_rate": 0.00023351151315789472, + "loss": 0.5074, + "step": 810 + }, + { + "epoch": 0.8997365136596865, + "grad_norm": 0.21877194941043854, + "learning_rate": 0.00023338815789473682, + "loss": 0.6044, + "step": 811 + }, + { + "epoch": 0.9008459298294272, + "grad_norm": 0.24642789363861084, + "learning_rate": 0.00023326480263157894, + "loss": 0.5249, + "step": 812 + }, + { + "epoch": 0.9019553459991679, + "grad_norm": 0.21799951791763306, + "learning_rate": 0.00023314144736842103, + "loss": 0.5438, + "step": 813 + }, + { + "epoch": 0.9030647621689086, + "grad_norm": 0.2310633808374405, + "learning_rate": 0.00023301809210526312, + "loss": 0.5451, + "step": 814 + }, + { + "epoch": 0.9041741783386493, + "grad_norm": 0.1848413199186325, + "learning_rate": 0.00023289473684210524, + "loss": 0.4699, + "step": 815 + }, + { + "epoch": 0.9052835945083899, + "grad_norm": 0.282272607088089, + "learning_rate": 0.00023277138157894734, + "loss": 0.5288, + "step": 816 + }, + { + "epoch": 0.9063930106781306, + "grad_norm": 0.19741901755332947, + "learning_rate": 0.00023264802631578943, + "loss": 0.4977, + "step": 817 + }, + { + "epoch": 0.9075024268478713, + "grad_norm": 0.2287929505109787, + "learning_rate": 0.00023252467105263158, + "loss": 0.5804, + "step": 818 + }, + { + "epoch": 0.908611843017612, + "grad_norm": 0.2509765326976776, + "learning_rate": 0.00023240131578947367, + "loss": 0.4381, + "step": 819 + }, + { + "epoch": 0.9097212591873527, + "grad_norm": 0.2717498540878296, + "learning_rate": 0.00023227796052631576, + "loss": 0.6095, + "step": 820 + }, + { + "epoch": 0.9108306753570933, + "grad_norm": 0.38568127155303955, + "learning_rate": 0.00023215460526315789, + "loss": 0.512, + "step": 821 + }, + { + "epoch": 0.911940091526834, + "grad_norm": 0.22532710433006287, + "learning_rate": 0.00023203124999999998, + "loss": 0.6333, + "step": 822 + }, + { + "epoch": 0.9130495076965747, + "grad_norm": 0.2616422176361084, + "learning_rate": 0.00023190789473684207, + "loss": 0.4955, + "step": 823 + }, + { + "epoch": 0.9141589238663154, + "grad_norm": 0.30875271558761597, + "learning_rate": 0.0002317845394736842, + "loss": 0.6645, + "step": 824 + }, + { + "epoch": 0.9152683400360561, + "grad_norm": 0.22438234090805054, + "learning_rate": 0.00023166118421052629, + "loss": 0.6209, + "step": 825 + }, + { + "epoch": 0.9163777562057966, + "grad_norm": 0.30306366086006165, + "learning_rate": 0.00023153782894736838, + "loss": 0.6832, + "step": 826 + }, + { + "epoch": 0.9174871723755373, + "grad_norm": 0.19736707210540771, + "learning_rate": 0.00023141447368421053, + "loss": 0.5076, + "step": 827 + }, + { + "epoch": 0.918596588545278, + "grad_norm": 0.37344890832901, + "learning_rate": 0.00023129111842105262, + "loss": 0.5337, + "step": 828 + }, + { + "epoch": 0.9197060047150187, + "grad_norm": 0.22570458054542542, + "learning_rate": 0.0002311677631578947, + "loss": 0.4477, + "step": 829 + }, + { + "epoch": 0.9208154208847594, + "grad_norm": 0.2158828228712082, + "learning_rate": 0.00023104440789473683, + "loss": 0.5738, + "step": 830 + }, + { + "epoch": 0.9219248370545001, + "grad_norm": 0.17967133224010468, + "learning_rate": 0.00023092105263157893, + "loss": 0.6245, + "step": 831 + }, + { + "epoch": 0.9230342532242407, + "grad_norm": 0.23400144279003143, + "learning_rate": 0.00023079769736842102, + "loss": 0.5328, + "step": 832 + }, + { + "epoch": 0.9241436693939814, + "grad_norm": 0.20684117078781128, + "learning_rate": 0.00023067434210526314, + "loss": 0.4556, + "step": 833 + }, + { + "epoch": 0.9252530855637221, + "grad_norm": 0.17495577037334442, + "learning_rate": 0.00023055098684210523, + "loss": 0.4022, + "step": 834 + }, + { + "epoch": 0.9263625017334628, + "grad_norm": 0.3661905527114868, + "learning_rate": 0.00023042763157894733, + "loss": 0.8714, + "step": 835 + }, + { + "epoch": 0.9274719179032035, + "grad_norm": 0.20915554463863373, + "learning_rate": 0.00023030427631578945, + "loss": 0.5164, + "step": 836 + }, + { + "epoch": 0.9285813340729441, + "grad_norm": 0.2413186877965927, + "learning_rate": 0.00023018092105263157, + "loss": 0.515, + "step": 837 + }, + { + "epoch": 0.9296907502426848, + "grad_norm": 0.23484086990356445, + "learning_rate": 0.00023005756578947369, + "loss": 0.5425, + "step": 838 + }, + { + "epoch": 0.9308001664124255, + "grad_norm": 0.269280344247818, + "learning_rate": 0.00022993421052631578, + "loss": 0.5335, + "step": 839 + }, + { + "epoch": 0.9319095825821662, + "grad_norm": 0.3550933003425598, + "learning_rate": 0.00022981085526315787, + "loss": 0.4588, + "step": 840 + }, + { + "epoch": 0.9330189987519069, + "grad_norm": 0.33913251757621765, + "learning_rate": 0.0002296875, + "loss": 0.6615, + "step": 841 + }, + { + "epoch": 0.9341284149216474, + "grad_norm": 0.31911739706993103, + "learning_rate": 0.00022956414473684209, + "loss": 0.6322, + "step": 842 + }, + { + "epoch": 0.9352378310913881, + "grad_norm": 0.33628326654434204, + "learning_rate": 0.00022944078947368418, + "loss": 0.5983, + "step": 843 + }, + { + "epoch": 0.9363472472611288, + "grad_norm": 0.1979479342699051, + "learning_rate": 0.00022931743421052627, + "loss": 0.4327, + "step": 844 + }, + { + "epoch": 0.9374566634308695, + "grad_norm": 0.23927690088748932, + "learning_rate": 0.0002291940789473684, + "loss": 0.5675, + "step": 845 + }, + { + "epoch": 0.9385660796006102, + "grad_norm": 0.23125141859054565, + "learning_rate": 0.0002290707236842105, + "loss": 0.5135, + "step": 846 + }, + { + "epoch": 0.9396754957703508, + "grad_norm": 0.2613430917263031, + "learning_rate": 0.00022894736842105263, + "loss": 0.4361, + "step": 847 + }, + { + "epoch": 0.9407849119400915, + "grad_norm": 0.28109273314476013, + "learning_rate": 0.00022882401315789473, + "loss": 0.5697, + "step": 848 + }, + { + "epoch": 0.9418943281098322, + "grad_norm": 0.25478866696357727, + "learning_rate": 0.00022870065789473682, + "loss": 0.5453, + "step": 849 + }, + { + "epoch": 0.9430037442795729, + "grad_norm": 0.2179301530122757, + "learning_rate": 0.00022857730263157894, + "loss": 0.5588, + "step": 850 + }, + { + "epoch": 0.9441131604493136, + "grad_norm": 0.20109961926937103, + "learning_rate": 0.00022845394736842103, + "loss": 0.5843, + "step": 851 + }, + { + "epoch": 0.9452225766190542, + "grad_norm": 0.18201051652431488, + "learning_rate": 0.00022833059210526313, + "loss": 0.5031, + "step": 852 + }, + { + "epoch": 0.9463319927887949, + "grad_norm": 0.25168418884277344, + "learning_rate": 0.00022820723684210525, + "loss": 0.7, + "step": 853 + }, + { + "epoch": 0.9474414089585356, + "grad_norm": 0.2502383291721344, + "learning_rate": 0.00022808388157894734, + "loss": 0.6002, + "step": 854 + }, + { + "epoch": 0.9485508251282763, + "grad_norm": 0.2077435404062271, + "learning_rate": 0.00022796052631578943, + "loss": 0.632, + "step": 855 + }, + { + "epoch": 0.949660241298017, + "grad_norm": 0.28856661915779114, + "learning_rate": 0.00022783717105263158, + "loss": 0.5973, + "step": 856 + }, + { + "epoch": 0.9507696574677575, + "grad_norm": 0.18906134366989136, + "learning_rate": 0.00022771381578947367, + "loss": 0.5271, + "step": 857 + }, + { + "epoch": 0.9518790736374982, + "grad_norm": 0.2617158591747284, + "learning_rate": 0.00022759046052631577, + "loss": 0.5909, + "step": 858 + }, + { + "epoch": 0.9529884898072389, + "grad_norm": 0.42665448784828186, + "learning_rate": 0.0002274671052631579, + "loss": 0.5476, + "step": 859 + }, + { + "epoch": 0.9540979059769796, + "grad_norm": 0.2901977598667145, + "learning_rate": 0.00022734374999999998, + "loss": 0.6033, + "step": 860 + }, + { + "epoch": 0.9552073221467203, + "grad_norm": 0.26080241799354553, + "learning_rate": 0.00022722039473684207, + "loss": 0.6149, + "step": 861 + }, + { + "epoch": 0.9563167383164609, + "grad_norm": 0.2692500948905945, + "learning_rate": 0.0002270970394736842, + "loss": 0.6035, + "step": 862 + }, + { + "epoch": 0.9574261544862016, + "grad_norm": 0.17691655457019806, + "learning_rate": 0.00022697368421052629, + "loss": 0.5072, + "step": 863 + }, + { + "epoch": 0.9585355706559423, + "grad_norm": 0.4164085388183594, + "learning_rate": 0.00022685032894736838, + "loss": 0.5342, + "step": 864 + }, + { + "epoch": 0.959644986825683, + "grad_norm": 0.18131154775619507, + "learning_rate": 0.00022672697368421053, + "loss": 0.5121, + "step": 865 + }, + { + "epoch": 0.9607544029954237, + "grad_norm": 0.22192832827568054, + "learning_rate": 0.00022660361842105262, + "loss": 0.5307, + "step": 866 + }, + { + "epoch": 0.9618638191651643, + "grad_norm": 0.350583016872406, + "learning_rate": 0.0002264802631578947, + "loss": 0.4176, + "step": 867 + }, + { + "epoch": 0.962973235334905, + "grad_norm": 0.2533394396305084, + "learning_rate": 0.00022635690789473683, + "loss": 0.5289, + "step": 868 + }, + { + "epoch": 0.9640826515046457, + "grad_norm": 0.19999092817306519, + "learning_rate": 0.00022623355263157893, + "loss": 0.5624, + "step": 869 + }, + { + "epoch": 0.9651920676743864, + "grad_norm": 0.17564158141613007, + "learning_rate": 0.00022611019736842102, + "loss": 0.4738, + "step": 870 + }, + { + "epoch": 0.9663014838441271, + "grad_norm": 0.35863691568374634, + "learning_rate": 0.00022598684210526314, + "loss": 0.7332, + "step": 871 + }, + { + "epoch": 0.9674109000138676, + "grad_norm": 0.20389032363891602, + "learning_rate": 0.00022586348684210523, + "loss": 0.4931, + "step": 872 + }, + { + "epoch": 0.9685203161836083, + "grad_norm": 0.19918256998062134, + "learning_rate": 0.00022574013157894733, + "loss": 0.5961, + "step": 873 + }, + { + "epoch": 0.969629732353349, + "grad_norm": 0.25909268856048584, + "learning_rate": 0.00022561677631578945, + "loss": 0.5575, + "step": 874 + }, + { + "epoch": 0.9707391485230897, + "grad_norm": 0.18549631536006927, + "learning_rate": 0.00022549342105263157, + "loss": 0.5181, + "step": 875 + }, + { + "epoch": 0.9718485646928304, + "grad_norm": 0.2645319998264313, + "learning_rate": 0.0002253700657894737, + "loss": 0.6302, + "step": 876 + }, + { + "epoch": 0.9729579808625711, + "grad_norm": 0.36148592829704285, + "learning_rate": 0.00022524671052631578, + "loss": 0.4178, + "step": 877 + }, + { + "epoch": 0.9740673970323117, + "grad_norm": 0.2523496150970459, + "learning_rate": 0.00022512335526315787, + "loss": 0.5333, + "step": 878 + }, + { + "epoch": 0.9751768132020524, + "grad_norm": 0.20175378024578094, + "learning_rate": 0.000225, + "loss": 0.4555, + "step": 879 + }, + { + "epoch": 0.9762862293717931, + "grad_norm": 0.7862651348114014, + "learning_rate": 0.0002248766447368421, + "loss": 0.492, + "step": 880 + }, + { + "epoch": 0.9773956455415338, + "grad_norm": 0.27353614568710327, + "learning_rate": 0.00022475328947368418, + "loss": 0.6768, + "step": 881 + }, + { + "epoch": 0.9785050617112745, + "grad_norm": 0.27447402477264404, + "learning_rate": 0.00022462993421052627, + "loss": 0.7058, + "step": 882 + }, + { + "epoch": 0.9796144778810151, + "grad_norm": 0.1933760941028595, + "learning_rate": 0.0002245065789473684, + "loss": 0.6976, + "step": 883 + }, + { + "epoch": 0.9807238940507558, + "grad_norm": 0.18439841270446777, + "learning_rate": 0.00022438322368421051, + "loss": 0.403, + "step": 884 + }, + { + "epoch": 0.9818333102204965, + "grad_norm": 0.27743586897850037, + "learning_rate": 0.00022425986842105263, + "loss": 0.7012, + "step": 885 + }, + { + "epoch": 0.9829427263902372, + "grad_norm": 0.26009905338287354, + "learning_rate": 0.00022413651315789473, + "loss": 0.5342, + "step": 886 + }, + { + "epoch": 0.9840521425599779, + "grad_norm": 0.30932557582855225, + "learning_rate": 0.00022401315789473682, + "loss": 0.6027, + "step": 887 + }, + { + "epoch": 0.9851615587297184, + "grad_norm": 0.21047377586364746, + "learning_rate": 0.00022388980263157894, + "loss": 0.4078, + "step": 888 + }, + { + "epoch": 0.9862709748994591, + "grad_norm": 0.17896795272827148, + "learning_rate": 0.00022376644736842103, + "loss": 0.5645, + "step": 889 + }, + { + "epoch": 0.9873803910691998, + "grad_norm": 0.31928160786628723, + "learning_rate": 0.00022364309210526313, + "loss": 0.6017, + "step": 890 + }, + { + "epoch": 0.9884898072389405, + "grad_norm": 0.2661576271057129, + "learning_rate": 0.00022351973684210525, + "loss": 0.4079, + "step": 891 + }, + { + "epoch": 0.9895992234086812, + "grad_norm": 0.17466966807842255, + "learning_rate": 0.00022339638157894734, + "loss": 0.4499, + "step": 892 + }, + { + "epoch": 0.9907086395784218, + "grad_norm": 0.29928284883499146, + "learning_rate": 0.00022327302631578943, + "loss": 0.6738, + "step": 893 + }, + { + "epoch": 0.9918180557481625, + "grad_norm": 0.22055590152740479, + "learning_rate": 0.00022314967105263158, + "loss": 0.4979, + "step": 894 + }, + { + "epoch": 0.9929274719179032, + "grad_norm": 0.20703110098838806, + "learning_rate": 0.00022302631578947367, + "loss": 0.34, + "step": 895 + }, + { + "epoch": 0.9940368880876439, + "grad_norm": 0.23895259201526642, + "learning_rate": 0.00022290296052631577, + "loss": 0.6151, + "step": 896 + }, + { + "epoch": 0.9951463042573846, + "grad_norm": 0.23790138959884644, + "learning_rate": 0.0002227796052631579, + "loss": 0.5713, + "step": 897 + }, + { + "epoch": 0.9962557204271252, + "grad_norm": 0.18377721309661865, + "learning_rate": 0.00022265624999999998, + "loss": 0.5082, + "step": 898 + }, + { + "epoch": 0.9973651365968659, + "grad_norm": 0.23333214223384857, + "learning_rate": 0.00022253289473684207, + "loss": 0.4654, + "step": 899 + }, + { + "epoch": 0.9984745527666066, + "grad_norm": 0.22151075303554535, + "learning_rate": 0.0002224095394736842, + "loss": 0.5104, + "step": 900 + }, + { + "epoch": 0.9995839689363473, + "grad_norm": 0.2722238302230835, + "learning_rate": 0.0002222861842105263, + "loss": 0.5298, + "step": 901 + }, + { + "epoch": 1.000693385106088, + "grad_norm": 0.3227219879627228, + "learning_rate": 0.00022216282894736838, + "loss": 0.6207, + "step": 902 + }, + { + "epoch": 1.0018028012758287, + "grad_norm": 0.21921642124652863, + "learning_rate": 0.00022203947368421053, + "loss": 0.4741, + "step": 903 + }, + { + "epoch": 1.0029122174455694, + "grad_norm": 0.18044739961624146, + "learning_rate": 0.00022191611842105262, + "loss": 0.2975, + "step": 904 + }, + { + "epoch": 1.00402163361531, + "grad_norm": 0.20503199100494385, + "learning_rate": 0.00022179276315789471, + "loss": 0.4205, + "step": 905 + }, + { + "epoch": 1.0051310497850505, + "grad_norm": 0.18091997504234314, + "learning_rate": 0.00022166940789473683, + "loss": 0.4393, + "step": 906 + }, + { + "epoch": 1.0062404659547912, + "grad_norm": 0.2026747614145279, + "learning_rate": 0.00022154605263157893, + "loss": 0.6194, + "step": 907 + }, + { + "epoch": 1.007349882124532, + "grad_norm": 0.20425230264663696, + "learning_rate": 0.00022142269736842102, + "loss": 0.7142, + "step": 908 + }, + { + "epoch": 1.0084592982942726, + "grad_norm": 0.22876769304275513, + "learning_rate": 0.00022129934210526314, + "loss": 0.5316, + "step": 909 + }, + { + "epoch": 1.0095687144640133, + "grad_norm": 0.3051469624042511, + "learning_rate": 0.00022117598684210523, + "loss": 0.526, + "step": 910 + }, + { + "epoch": 1.010678130633754, + "grad_norm": 0.31023165583610535, + "learning_rate": 0.00022105263157894733, + "loss": 0.6281, + "step": 911 + }, + { + "epoch": 1.0117875468034947, + "grad_norm": 0.2776643633842468, + "learning_rate": 0.00022092927631578945, + "loss": 0.6222, + "step": 912 + }, + { + "epoch": 1.0128969629732354, + "grad_norm": 0.17718107998371124, + "learning_rate": 0.00022080592105263157, + "loss": 0.497, + "step": 913 + }, + { + "epoch": 1.014006379142976, + "grad_norm": 0.18926644325256348, + "learning_rate": 0.0002206825657894737, + "loss": 0.3907, + "step": 914 + }, + { + "epoch": 1.0151157953127168, + "grad_norm": 0.21219785511493683, + "learning_rate": 0.00022055921052631578, + "loss": 0.7063, + "step": 915 + }, + { + "epoch": 1.0162252114824573, + "grad_norm": 0.30529090762138367, + "learning_rate": 0.00022043585526315787, + "loss": 0.5273, + "step": 916 + }, + { + "epoch": 1.017334627652198, + "grad_norm": 0.2160457968711853, + "learning_rate": 0.00022031249999999997, + "loss": 0.6503, + "step": 917 + }, + { + "epoch": 1.0184440438219386, + "grad_norm": 0.19193992018699646, + "learning_rate": 0.0002201891447368421, + "loss": 0.5096, + "step": 918 + }, + { + "epoch": 1.0195534599916793, + "grad_norm": 0.1873706579208374, + "learning_rate": 0.00022006578947368418, + "loss": 0.5354, + "step": 919 + }, + { + "epoch": 1.02066287616142, + "grad_norm": 0.183074951171875, + "learning_rate": 0.00021994243421052627, + "loss": 0.4163, + "step": 920 + }, + { + "epoch": 1.0217722923311607, + "grad_norm": 0.15584836900234222, + "learning_rate": 0.0002198190789473684, + "loss": 0.3087, + "step": 921 + }, + { + "epoch": 1.0228817085009014, + "grad_norm": 0.1514248251914978, + "learning_rate": 0.00021969572368421052, + "loss": 0.492, + "step": 922 + }, + { + "epoch": 1.0239911246706421, + "grad_norm": 0.11994735896587372, + "learning_rate": 0.00021957236842105264, + "loss": 0.4445, + "step": 923 + }, + { + "epoch": 1.0251005408403828, + "grad_norm": 0.23181642591953278, + "learning_rate": 0.00021944901315789473, + "loss": 0.5837, + "step": 924 + }, + { + "epoch": 1.0262099570101235, + "grad_norm": 0.24097567796707153, + "learning_rate": 0.00021932565789473682, + "loss": 0.5709, + "step": 925 + }, + { + "epoch": 1.027319373179864, + "grad_norm": 0.2901909649372101, + "learning_rate": 0.00021920230263157894, + "loss": 0.6171, + "step": 926 + }, + { + "epoch": 1.0284287893496047, + "grad_norm": 0.2848079800605774, + "learning_rate": 0.00021907894736842104, + "loss": 0.5066, + "step": 927 + }, + { + "epoch": 1.0295382055193454, + "grad_norm": 0.26711946725845337, + "learning_rate": 0.00021895559210526313, + "loss": 0.4874, + "step": 928 + }, + { + "epoch": 1.030647621689086, + "grad_norm": 0.20230190455913544, + "learning_rate": 0.00021883223684210525, + "loss": 0.4349, + "step": 929 + }, + { + "epoch": 1.0317570378588268, + "grad_norm": 0.2531988322734833, + "learning_rate": 0.00021870888157894734, + "loss": 0.5349, + "step": 930 + }, + { + "epoch": 1.0328664540285675, + "grad_norm": 0.19993358850479126, + "learning_rate": 0.00021858552631578944, + "loss": 0.4689, + "step": 931 + }, + { + "epoch": 1.0339758701983082, + "grad_norm": 0.36045175790786743, + "learning_rate": 0.00021846217105263158, + "loss": 0.5803, + "step": 932 + }, + { + "epoch": 1.0350852863680489, + "grad_norm": 0.2449137419462204, + "learning_rate": 0.00021833881578947368, + "loss": 0.5587, + "step": 933 + }, + { + "epoch": 1.0361947025377896, + "grad_norm": 0.21992360055446625, + "learning_rate": 0.00021821546052631577, + "loss": 0.4949, + "step": 934 + }, + { + "epoch": 1.0373041187075303, + "grad_norm": 0.22803427278995514, + "learning_rate": 0.0002180921052631579, + "loss": 0.3884, + "step": 935 + }, + { + "epoch": 1.0384135348772707, + "grad_norm": 0.31868723034858704, + "learning_rate": 0.00021796874999999998, + "loss": 0.4716, + "step": 936 + }, + { + "epoch": 1.0395229510470114, + "grad_norm": 0.2443029284477234, + "learning_rate": 0.00021784539473684208, + "loss": 0.4942, + "step": 937 + }, + { + "epoch": 1.0406323672167521, + "grad_norm": 0.16727162897586823, + "learning_rate": 0.0002177220394736842, + "loss": 0.5104, + "step": 938 + }, + { + "epoch": 1.0417417833864928, + "grad_norm": 0.2707812488079071, + "learning_rate": 0.0002175986842105263, + "loss": 0.7545, + "step": 939 + }, + { + "epoch": 1.0428511995562335, + "grad_norm": 0.20524704456329346, + "learning_rate": 0.00021747532894736838, + "loss": 0.4782, + "step": 940 + }, + { + "epoch": 1.0439606157259742, + "grad_norm": 0.2047165483236313, + "learning_rate": 0.00021735197368421053, + "loss": 0.6799, + "step": 941 + }, + { + "epoch": 1.045070031895715, + "grad_norm": 0.21171070635318756, + "learning_rate": 0.00021722861842105262, + "loss": 0.6063, + "step": 942 + }, + { + "epoch": 1.0461794480654556, + "grad_norm": 0.2965310215950012, + "learning_rate": 0.00021710526315789472, + "loss": 0.8489, + "step": 943 + }, + { + "epoch": 1.0472888642351963, + "grad_norm": 0.18332551419734955, + "learning_rate": 0.00021698190789473684, + "loss": 0.5516, + "step": 944 + }, + { + "epoch": 1.048398280404937, + "grad_norm": 0.21715614199638367, + "learning_rate": 0.00021685855263157893, + "loss": 0.5219, + "step": 945 + }, + { + "epoch": 1.0495076965746777, + "grad_norm": 0.2923019230365753, + "learning_rate": 0.00021673519736842102, + "loss": 0.7848, + "step": 946 + }, + { + "epoch": 1.0506171127444182, + "grad_norm": 0.259674072265625, + "learning_rate": 0.00021661184210526314, + "loss": 0.3492, + "step": 947 + }, + { + "epoch": 1.0517265289141589, + "grad_norm": 0.158514603972435, + "learning_rate": 0.00021648848684210524, + "loss": 0.5425, + "step": 948 + }, + { + "epoch": 1.0528359450838995, + "grad_norm": 0.1703483611345291, + "learning_rate": 0.00021636513157894733, + "loss": 0.4018, + "step": 949 + }, + { + "epoch": 1.0539453612536402, + "grad_norm": 0.1840825378894806, + "learning_rate": 0.00021624177631578945, + "loss": 0.3122, + "step": 950 + }, + { + "epoch": 1.055054777423381, + "grad_norm": 0.3050348162651062, + "learning_rate": 0.00021611842105263157, + "loss": 0.6332, + "step": 951 + }, + { + "epoch": 1.0561641935931216, + "grad_norm": 0.2472129911184311, + "learning_rate": 0.00021599506578947366, + "loss": 0.4852, + "step": 952 + }, + { + "epoch": 1.0572736097628623, + "grad_norm": 0.18661095201969147, + "learning_rate": 0.00021587171052631578, + "loss": 0.3293, + "step": 953 + }, + { + "epoch": 1.058383025932603, + "grad_norm": 0.3015456795692444, + "learning_rate": 0.00021574835526315788, + "loss": 0.5981, + "step": 954 + }, + { + "epoch": 1.0594924421023437, + "grad_norm": 0.24642759561538696, + "learning_rate": 0.00021562499999999997, + "loss": 0.4401, + "step": 955 + }, + { + "epoch": 1.0606018582720844, + "grad_norm": 0.23688514530658722, + "learning_rate": 0.0002155016447368421, + "loss": 0.6831, + "step": 956 + }, + { + "epoch": 1.061711274441825, + "grad_norm": 0.20036476850509644, + "learning_rate": 0.00021537828947368418, + "loss": 0.4985, + "step": 957 + }, + { + "epoch": 1.0628206906115656, + "grad_norm": 0.21736058592796326, + "learning_rate": 0.00021525493421052628, + "loss": 0.7388, + "step": 958 + }, + { + "epoch": 1.0639301067813063, + "grad_norm": 0.16539451479911804, + "learning_rate": 0.0002151315789473684, + "loss": 0.6139, + "step": 959 + }, + { + "epoch": 1.065039522951047, + "grad_norm": 0.2896292209625244, + "learning_rate": 0.00021500822368421052, + "loss": 0.6531, + "step": 960 + }, + { + "epoch": 1.0661489391207877, + "grad_norm": 0.2285917103290558, + "learning_rate": 0.00021488486842105264, + "loss": 0.4059, + "step": 961 + }, + { + "epoch": 1.0672583552905284, + "grad_norm": 0.2950262725353241, + "learning_rate": 0.00021476151315789473, + "loss": 0.6509, + "step": 962 + }, + { + "epoch": 1.068367771460269, + "grad_norm": 0.30050045251846313, + "learning_rate": 0.00021463815789473682, + "loss": 0.513, + "step": 963 + }, + { + "epoch": 1.0694771876300098, + "grad_norm": 0.22528204321861267, + "learning_rate": 0.00021451480263157894, + "loss": 0.4885, + "step": 964 + }, + { + "epoch": 1.0705866037997505, + "grad_norm": 0.18981274962425232, + "learning_rate": 0.00021439144736842104, + "loss": 0.4872, + "step": 965 + }, + { + "epoch": 1.0716960199694912, + "grad_norm": 0.17064198851585388, + "learning_rate": 0.00021426809210526313, + "loss": 0.4601, + "step": 966 + }, + { + "epoch": 1.0728054361392316, + "grad_norm": 0.2478228509426117, + "learning_rate": 0.00021414473684210522, + "loss": 0.4115, + "step": 967 + }, + { + "epoch": 1.0739148523089723, + "grad_norm": 0.22317509353160858, + "learning_rate": 0.00021402138157894734, + "loss": 0.4975, + "step": 968 + }, + { + "epoch": 1.075024268478713, + "grad_norm": 0.26215845346450806, + "learning_rate": 0.00021389802631578944, + "loss": 0.5119, + "step": 969 + }, + { + "epoch": 1.0761336846484537, + "grad_norm": 0.2661817967891693, + "learning_rate": 0.00021377467105263158, + "loss": 0.4886, + "step": 970 + }, + { + "epoch": 1.0772431008181944, + "grad_norm": 0.3064764738082886, + "learning_rate": 0.00021365131578947368, + "loss": 0.4684, + "step": 971 + }, + { + "epoch": 1.078352516987935, + "grad_norm": 0.1887666881084442, + "learning_rate": 0.00021352796052631577, + "loss": 0.5324, + "step": 972 + }, + { + "epoch": 1.0794619331576758, + "grad_norm": 0.446283221244812, + "learning_rate": 0.0002134046052631579, + "loss": 0.4313, + "step": 973 + }, + { + "epoch": 1.0805713493274165, + "grad_norm": 0.33540961146354675, + "learning_rate": 0.00021328124999999998, + "loss": 0.6241, + "step": 974 + }, + { + "epoch": 1.0816807654971572, + "grad_norm": 0.234646275639534, + "learning_rate": 0.00021315789473684208, + "loss": 0.3787, + "step": 975 + }, + { + "epoch": 1.0827901816668979, + "grad_norm": 0.21899907290935516, + "learning_rate": 0.0002130345394736842, + "loss": 0.5201, + "step": 976 + }, + { + "epoch": 1.0838995978366386, + "grad_norm": 0.18094147741794586, + "learning_rate": 0.0002129111842105263, + "loss": 0.44, + "step": 977 + }, + { + "epoch": 1.085009014006379, + "grad_norm": 0.1795743703842163, + "learning_rate": 0.00021278782894736838, + "loss": 0.4502, + "step": 978 + }, + { + "epoch": 1.0861184301761198, + "grad_norm": 0.24622996151447296, + "learning_rate": 0.00021266447368421053, + "loss": 0.4467, + "step": 979 + }, + { + "epoch": 1.0872278463458604, + "grad_norm": 0.32457393407821655, + "learning_rate": 0.00021254111842105262, + "loss": 0.5973, + "step": 980 + }, + { + "epoch": 1.0883372625156011, + "grad_norm": 0.18922576308250427, + "learning_rate": 0.00021241776315789472, + "loss": 0.2973, + "step": 981 + }, + { + "epoch": 1.0894466786853418, + "grad_norm": 0.2579767405986786, + "learning_rate": 0.00021229440789473684, + "loss": 0.4825, + "step": 982 + }, + { + "epoch": 1.0905560948550825, + "grad_norm": 0.26494085788726807, + "learning_rate": 0.00021217105263157893, + "loss": 0.7183, + "step": 983 + }, + { + "epoch": 1.0916655110248232, + "grad_norm": 0.26830682158470154, + "learning_rate": 0.00021204769736842102, + "loss": 0.6421, + "step": 984 + }, + { + "epoch": 1.092774927194564, + "grad_norm": 0.21053458750247955, + "learning_rate": 0.00021192434210526314, + "loss": 0.3367, + "step": 985 + }, + { + "epoch": 1.0938843433643046, + "grad_norm": 0.27601686120033264, + "learning_rate": 0.00021180098684210524, + "loss": 0.5145, + "step": 986 + }, + { + "epoch": 1.094993759534045, + "grad_norm": 0.28960034251213074, + "learning_rate": 0.00021167763157894733, + "loss": 0.548, + "step": 987 + }, + { + "epoch": 1.0961031757037858, + "grad_norm": 0.1937156766653061, + "learning_rate": 0.00021155427631578945, + "loss": 0.3746, + "step": 988 + }, + { + "epoch": 1.0972125918735265, + "grad_norm": 0.22715970873832703, + "learning_rate": 0.00021143092105263157, + "loss": 0.6459, + "step": 989 + }, + { + "epoch": 1.0983220080432672, + "grad_norm": 0.2094552218914032, + "learning_rate": 0.00021130756578947366, + "loss": 0.4155, + "step": 990 + }, + { + "epoch": 1.0994314242130079, + "grad_norm": 0.23469318449497223, + "learning_rate": 0.00021118421052631578, + "loss": 0.6588, + "step": 991 + }, + { + "epoch": 1.1005408403827486, + "grad_norm": 0.18322822451591492, + "learning_rate": 0.00021106085526315788, + "loss": 0.4636, + "step": 992 + }, + { + "epoch": 1.1016502565524893, + "grad_norm": 0.2863262891769409, + "learning_rate": 0.00021093749999999997, + "loss": 0.4921, + "step": 993 + }, + { + "epoch": 1.10275967272223, + "grad_norm": 0.2982548177242279, + "learning_rate": 0.0002108141447368421, + "loss": 0.6896, + "step": 994 + }, + { + "epoch": 1.1038690888919707, + "grad_norm": 0.35214439034461975, + "learning_rate": 0.00021069078947368418, + "loss": 0.4343, + "step": 995 + }, + { + "epoch": 1.1049785050617114, + "grad_norm": 0.19073866307735443, + "learning_rate": 0.00021056743421052628, + "loss": 0.4724, + "step": 996 + }, + { + "epoch": 1.106087921231452, + "grad_norm": 0.24945704638957977, + "learning_rate": 0.0002104440789473684, + "loss": 0.3815, + "step": 997 + }, + { + "epoch": 1.1071973374011925, + "grad_norm": 0.27915990352630615, + "learning_rate": 0.00021032072368421052, + "loss": 0.5862, + "step": 998 + }, + { + "epoch": 1.1083067535709332, + "grad_norm": 0.3030833899974823, + "learning_rate": 0.00021019736842105264, + "loss": 0.4567, + "step": 999 + }, + { + "epoch": 1.109416169740674, + "grad_norm": 0.22396822273731232, + "learning_rate": 0.00021007401315789473, + "loss": 0.4252, + "step": 1000 + }, + { + "epoch": 1.1105255859104146, + "grad_norm": 0.285520076751709, + "learning_rate": 0.00020995065789473682, + "loss": 0.3036, + "step": 1001 + }, + { + "epoch": 1.1116350020801553, + "grad_norm": 0.2318202704191208, + "learning_rate": 0.00020982730263157892, + "loss": 0.44, + "step": 1002 + }, + { + "epoch": 1.112744418249896, + "grad_norm": 0.24701035022735596, + "learning_rate": 0.00020970394736842104, + "loss": 0.5542, + "step": 1003 + }, + { + "epoch": 1.1138538344196367, + "grad_norm": 0.25920355319976807, + "learning_rate": 0.00020958059210526313, + "loss": 0.5066, + "step": 1004 + }, + { + "epoch": 1.1149632505893774, + "grad_norm": 0.6409327387809753, + "learning_rate": 0.00020945723684210522, + "loss": 0.6025, + "step": 1005 + }, + { + "epoch": 1.116072666759118, + "grad_norm": 0.469032883644104, + "learning_rate": 0.00020933388157894734, + "loss": 0.4099, + "step": 1006 + }, + { + "epoch": 1.1171820829288588, + "grad_norm": 0.181956484913826, + "learning_rate": 0.00020921052631578944, + "loss": 0.353, + "step": 1007 + }, + { + "epoch": 1.1182914990985993, + "grad_norm": 0.2923588752746582, + "learning_rate": 0.00020908717105263158, + "loss": 0.5331, + "step": 1008 + }, + { + "epoch": 1.11940091526834, + "grad_norm": 0.29170429706573486, + "learning_rate": 0.00020896381578947368, + "loss": 0.6123, + "step": 1009 + }, + { + "epoch": 1.1205103314380807, + "grad_norm": 0.23297952115535736, + "learning_rate": 0.00020884046052631577, + "loss": 0.3093, + "step": 1010 + }, + { + "epoch": 1.1216197476078213, + "grad_norm": 0.3802937865257263, + "learning_rate": 0.0002087171052631579, + "loss": 0.5309, + "step": 1011 + }, + { + "epoch": 1.122729163777562, + "grad_norm": 0.21260103583335876, + "learning_rate": 0.00020859374999999998, + "loss": 0.5288, + "step": 1012 + }, + { + "epoch": 1.1238385799473027, + "grad_norm": 0.22543269395828247, + "learning_rate": 0.00020847039473684208, + "loss": 0.4474, + "step": 1013 + }, + { + "epoch": 1.1249479961170434, + "grad_norm": 0.23437976837158203, + "learning_rate": 0.0002083470394736842, + "loss": 0.622, + "step": 1014 + }, + { + "epoch": 1.1260574122867841, + "grad_norm": 0.22682055830955505, + "learning_rate": 0.0002082236842105263, + "loss": 0.4028, + "step": 1015 + }, + { + "epoch": 1.1271668284565248, + "grad_norm": 0.27149879932403564, + "learning_rate": 0.00020810032894736838, + "loss": 0.3926, + "step": 1016 + }, + { + "epoch": 1.1282762446262655, + "grad_norm": 0.21635794639587402, + "learning_rate": 0.00020797697368421053, + "loss": 0.5204, + "step": 1017 + }, + { + "epoch": 1.129385660796006, + "grad_norm": 0.29137322306632996, + "learning_rate": 0.00020785361842105263, + "loss": 0.5625, + "step": 1018 + }, + { + "epoch": 1.1304950769657467, + "grad_norm": 0.2547966241836548, + "learning_rate": 0.00020773026315789472, + "loss": 0.7025, + "step": 1019 + }, + { + "epoch": 1.1316044931354874, + "grad_norm": 0.34155234694480896, + "learning_rate": 0.00020760690789473684, + "loss": 0.5504, + "step": 1020 + }, + { + "epoch": 1.132713909305228, + "grad_norm": 0.30655375123023987, + "learning_rate": 0.00020748355263157893, + "loss": 0.8426, + "step": 1021 + }, + { + "epoch": 1.1338233254749688, + "grad_norm": 0.24423359334468842, + "learning_rate": 0.00020736019736842102, + "loss": 0.4466, + "step": 1022 + }, + { + "epoch": 1.1349327416447095, + "grad_norm": 0.24093151092529297, + "learning_rate": 0.00020723684210526315, + "loss": 0.6558, + "step": 1023 + }, + { + "epoch": 1.1360421578144502, + "grad_norm": 0.2106793373823166, + "learning_rate": 0.00020711348684210524, + "loss": 0.4964, + "step": 1024 + }, + { + "epoch": 1.1371515739841909, + "grad_norm": 0.27556413412094116, + "learning_rate": 0.00020699013157894733, + "loss": 0.4517, + "step": 1025 + }, + { + "epoch": 1.1382609901539316, + "grad_norm": 0.28604790568351746, + "learning_rate": 0.00020686677631578945, + "loss": 0.606, + "step": 1026 + }, + { + "epoch": 1.1393704063236723, + "grad_norm": 0.2911072075366974, + "learning_rate": 0.00020674342105263157, + "loss": 0.4299, + "step": 1027 + }, + { + "epoch": 1.140479822493413, + "grad_norm": 0.19676262140274048, + "learning_rate": 0.00020662006578947367, + "loss": 0.3269, + "step": 1028 + }, + { + "epoch": 1.1415892386631534, + "grad_norm": 0.2523801624774933, + "learning_rate": 0.00020649671052631579, + "loss": 0.3888, + "step": 1029 + }, + { + "epoch": 1.1426986548328941, + "grad_norm": 0.2896265387535095, + "learning_rate": 0.00020637335526315788, + "loss": 0.4757, + "step": 1030 + }, + { + "epoch": 1.1438080710026348, + "grad_norm": 0.24346299469470978, + "learning_rate": 0.00020624999999999997, + "loss": 0.5065, + "step": 1031 + }, + { + "epoch": 1.1449174871723755, + "grad_norm": 0.2244485467672348, + "learning_rate": 0.0002061266447368421, + "loss": 0.4212, + "step": 1032 + }, + { + "epoch": 1.1460269033421162, + "grad_norm": 0.2615904211997986, + "learning_rate": 0.00020600328947368419, + "loss": 0.3755, + "step": 1033 + }, + { + "epoch": 1.147136319511857, + "grad_norm": 0.2203037440776825, + "learning_rate": 0.00020587993421052628, + "loss": 0.7468, + "step": 1034 + }, + { + "epoch": 1.1482457356815976, + "grad_norm": 0.2502787709236145, + "learning_rate": 0.0002057565789473684, + "loss": 0.4717, + "step": 1035 + }, + { + "epoch": 1.1493551518513383, + "grad_norm": 0.23755724728107452, + "learning_rate": 0.00020563322368421052, + "loss": 0.4367, + "step": 1036 + }, + { + "epoch": 1.150464568021079, + "grad_norm": 0.2541312575340271, + "learning_rate": 0.00020550986842105264, + "loss": 0.3824, + "step": 1037 + }, + { + "epoch": 1.1515739841908195, + "grad_norm": 0.2297431081533432, + "learning_rate": 0.00020538651315789473, + "loss": 0.5744, + "step": 1038 + }, + { + "epoch": 1.1526834003605602, + "grad_norm": 0.25546327233314514, + "learning_rate": 0.00020526315789473683, + "loss": 0.5484, + "step": 1039 + }, + { + "epoch": 1.1537928165303009, + "grad_norm": 0.27455171942710876, + "learning_rate": 0.00020513980263157892, + "loss": 0.4961, + "step": 1040 + }, + { + "epoch": 1.1549022327000416, + "grad_norm": 0.221107617020607, + "learning_rate": 0.00020501644736842104, + "loss": 0.6225, + "step": 1041 + }, + { + "epoch": 1.1560116488697822, + "grad_norm": 0.3334260582923889, + "learning_rate": 0.00020489309210526313, + "loss": 0.5827, + "step": 1042 + }, + { + "epoch": 1.157121065039523, + "grad_norm": 0.2542990446090698, + "learning_rate": 0.00020476973684210523, + "loss": 0.4444, + "step": 1043 + }, + { + "epoch": 1.1582304812092636, + "grad_norm": 0.24342231452465057, + "learning_rate": 0.00020464638157894735, + "loss": 0.4379, + "step": 1044 + }, + { + "epoch": 1.1593398973790043, + "grad_norm": 0.27247416973114014, + "learning_rate": 0.00020452302631578944, + "loss": 0.5547, + "step": 1045 + }, + { + "epoch": 1.160449313548745, + "grad_norm": 0.21573393046855927, + "learning_rate": 0.00020439967105263159, + "loss": 0.4349, + "step": 1046 + }, + { + "epoch": 1.1615587297184857, + "grad_norm": 0.2523028552532196, + "learning_rate": 0.00020427631578947368, + "loss": 0.4237, + "step": 1047 + }, + { + "epoch": 1.1626681458882264, + "grad_norm": 0.30524906516075134, + "learning_rate": 0.00020415296052631577, + "loss": 0.3916, + "step": 1048 + }, + { + "epoch": 1.163777562057967, + "grad_norm": 0.3323182463645935, + "learning_rate": 0.0002040296052631579, + "loss": 0.6462, + "step": 1049 + }, + { + "epoch": 1.1648869782277076, + "grad_norm": 0.31420886516571045, + "learning_rate": 0.00020390624999999999, + "loss": 0.4049, + "step": 1050 + }, + { + "epoch": 1.1659963943974483, + "grad_norm": 0.32108211517333984, + "learning_rate": 0.00020378289473684208, + "loss": 0.4562, + "step": 1051 + }, + { + "epoch": 1.167105810567189, + "grad_norm": 0.1855727732181549, + "learning_rate": 0.00020365953947368417, + "loss": 0.4673, + "step": 1052 + }, + { + "epoch": 1.1682152267369297, + "grad_norm": 0.3211022615432739, + "learning_rate": 0.0002035361842105263, + "loss": 0.3942, + "step": 1053 + }, + { + "epoch": 1.1693246429066704, + "grad_norm": 0.25351840257644653, + "learning_rate": 0.00020341282894736839, + "loss": 0.5996, + "step": 1054 + }, + { + "epoch": 1.170434059076411, + "grad_norm": 0.3350581228733063, + "learning_rate": 0.00020328947368421053, + "loss": 0.9422, + "step": 1055 + }, + { + "epoch": 1.1715434752461518, + "grad_norm": 0.28251969814300537, + "learning_rate": 0.00020316611842105263, + "loss": 0.5418, + "step": 1056 + }, + { + "epoch": 1.1726528914158925, + "grad_norm": 0.25546523928642273, + "learning_rate": 0.00020304276315789472, + "loss": 0.5415, + "step": 1057 + }, + { + "epoch": 1.173762307585633, + "grad_norm": 0.2648818790912628, + "learning_rate": 0.00020291940789473684, + "loss": 0.5584, + "step": 1058 + }, + { + "epoch": 1.1748717237553739, + "grad_norm": 0.3260975480079651, + "learning_rate": 0.00020279605263157893, + "loss": 0.5427, + "step": 1059 + }, + { + "epoch": 1.1759811399251143, + "grad_norm": 0.2364053875207901, + "learning_rate": 0.00020267269736842103, + "loss": 0.5468, + "step": 1060 + }, + { + "epoch": 1.177090556094855, + "grad_norm": 0.3164118230342865, + "learning_rate": 0.00020254934210526315, + "loss": 0.5242, + "step": 1061 + }, + { + "epoch": 1.1781999722645957, + "grad_norm": 0.22556494176387787, + "learning_rate": 0.00020242598684210524, + "loss": 0.6513, + "step": 1062 + }, + { + "epoch": 1.1793093884343364, + "grad_norm": 0.27951955795288086, + "learning_rate": 0.00020230263157894733, + "loss": 0.6117, + "step": 1063 + }, + { + "epoch": 1.180418804604077, + "grad_norm": 0.271635502576828, + "learning_rate": 0.00020217927631578945, + "loss": 0.3996, + "step": 1064 + }, + { + "epoch": 1.1815282207738178, + "grad_norm": 0.23386971652507782, + "learning_rate": 0.00020205592105263157, + "loss": 0.4866, + "step": 1065 + }, + { + "epoch": 1.1826376369435585, + "grad_norm": 0.2693704664707184, + "learning_rate": 0.00020193256578947367, + "loss": 0.6313, + "step": 1066 + }, + { + "epoch": 1.1837470531132992, + "grad_norm": 0.2502618432044983, + "learning_rate": 0.0002018092105263158, + "loss": 0.5445, + "step": 1067 + }, + { + "epoch": 1.18485646928304, + "grad_norm": 0.22942642867565155, + "learning_rate": 0.00020168585526315788, + "loss": 0.4443, + "step": 1068 + }, + { + "epoch": 1.1859658854527804, + "grad_norm": 0.23630082607269287, + "learning_rate": 0.00020156249999999997, + "loss": 0.5264, + "step": 1069 + }, + { + "epoch": 1.187075301622521, + "grad_norm": 0.29252779483795166, + "learning_rate": 0.0002014391447368421, + "loss": 0.5528, + "step": 1070 + }, + { + "epoch": 1.1881847177922618, + "grad_norm": 0.2584315538406372, + "learning_rate": 0.0002013157894736842, + "loss": 0.5017, + "step": 1071 + }, + { + "epoch": 1.1892941339620025, + "grad_norm": 0.28517627716064453, + "learning_rate": 0.00020119243421052628, + "loss": 0.4536, + "step": 1072 + }, + { + "epoch": 1.1904035501317431, + "grad_norm": 0.432355672121048, + "learning_rate": 0.0002010690789473684, + "loss": 0.592, + "step": 1073 + }, + { + "epoch": 1.1915129663014838, + "grad_norm": 0.3467410206794739, + "learning_rate": 0.00020094572368421052, + "loss": 0.7285, + "step": 1074 + }, + { + "epoch": 1.1926223824712245, + "grad_norm": 0.29943886399269104, + "learning_rate": 0.0002008223684210526, + "loss": 0.7591, + "step": 1075 + }, + { + "epoch": 1.1937317986409652, + "grad_norm": 0.17552253603935242, + "learning_rate": 0.00020069901315789473, + "loss": 0.2609, + "step": 1076 + }, + { + "epoch": 1.194841214810706, + "grad_norm": 0.20625296235084534, + "learning_rate": 0.00020057565789473683, + "loss": 0.3158, + "step": 1077 + }, + { + "epoch": 1.1959506309804466, + "grad_norm": 0.27414992451667786, + "learning_rate": 0.00020045230263157892, + "loss": 0.3384, + "step": 1078 + }, + { + "epoch": 1.1970600471501873, + "grad_norm": 0.2584741711616516, + "learning_rate": 0.00020032894736842104, + "loss": 0.3626, + "step": 1079 + }, + { + "epoch": 1.1981694633199278, + "grad_norm": 0.2000085860490799, + "learning_rate": 0.00020020559210526313, + "loss": 0.3534, + "step": 1080 + }, + { + "epoch": 1.1992788794896685, + "grad_norm": 0.4106660783290863, + "learning_rate": 0.00020008223684210523, + "loss": 0.4994, + "step": 1081 + }, + { + "epoch": 1.2003882956594092, + "grad_norm": 0.3149929940700531, + "learning_rate": 0.00019995888157894735, + "loss": 0.5106, + "step": 1082 + }, + { + "epoch": 1.2014977118291499, + "grad_norm": 0.27537012100219727, + "learning_rate": 0.00019983552631578944, + "loss": 0.403, + "step": 1083 + }, + { + "epoch": 1.2026071279988906, + "grad_norm": 0.27250340580940247, + "learning_rate": 0.0001997121710526316, + "loss": 0.4476, + "step": 1084 + }, + { + "epoch": 1.2037165441686313, + "grad_norm": 0.3000098168849945, + "learning_rate": 0.00019958881578947368, + "loss": 0.5365, + "step": 1085 + }, + { + "epoch": 1.204825960338372, + "grad_norm": 0.2616446912288666, + "learning_rate": 0.00019946546052631577, + "loss": 0.4966, + "step": 1086 + }, + { + "epoch": 1.2059353765081127, + "grad_norm": 0.3749598264694214, + "learning_rate": 0.0001993421052631579, + "loss": 0.4181, + "step": 1087 + }, + { + "epoch": 1.2070447926778534, + "grad_norm": 0.22599004209041595, + "learning_rate": 0.00019921875, + "loss": 0.4952, + "step": 1088 + }, + { + "epoch": 1.2081542088475938, + "grad_norm": 0.3596106469631195, + "learning_rate": 0.00019909539473684208, + "loss": 0.4713, + "step": 1089 + }, + { + "epoch": 1.2092636250173345, + "grad_norm": 0.32484951615333557, + "learning_rate": 0.00019897203947368417, + "loss": 0.5465, + "step": 1090 + }, + { + "epoch": 1.2103730411870752, + "grad_norm": 0.3571338951587677, + "learning_rate": 0.0001988486842105263, + "loss": 0.6788, + "step": 1091 + }, + { + "epoch": 1.211482457356816, + "grad_norm": 0.34311506152153015, + "learning_rate": 0.0001987253289473684, + "loss": 0.5347, + "step": 1092 + }, + { + "epoch": 1.2125918735265566, + "grad_norm": 0.32343587279319763, + "learning_rate": 0.00019860197368421053, + "loss": 0.3366, + "step": 1093 + }, + { + "epoch": 1.2137012896962973, + "grad_norm": 0.22507165372371674, + "learning_rate": 0.00019847861842105263, + "loss": 0.5374, + "step": 1094 + }, + { + "epoch": 1.214810705866038, + "grad_norm": 0.25680041313171387, + "learning_rate": 0.00019835526315789472, + "loss": 0.3549, + "step": 1095 + }, + { + "epoch": 1.2159201220357787, + "grad_norm": 0.2538761496543884, + "learning_rate": 0.00019823190789473684, + "loss": 0.4335, + "step": 1096 + }, + { + "epoch": 1.2170295382055194, + "grad_norm": 0.2690007984638214, + "learning_rate": 0.00019810855263157893, + "loss": 0.7695, + "step": 1097 + }, + { + "epoch": 1.21813895437526, + "grad_norm": 0.19778668880462646, + "learning_rate": 0.00019798519736842103, + "loss": 0.3248, + "step": 1098 + }, + { + "epoch": 1.2192483705450008, + "grad_norm": 0.23934458196163177, + "learning_rate": 0.00019786184210526315, + "loss": 0.3444, + "step": 1099 + }, + { + "epoch": 1.2203577867147413, + "grad_norm": 0.2791898846626282, + "learning_rate": 0.00019773848684210524, + "loss": 0.4964, + "step": 1100 + }, + { + "epoch": 1.221467202884482, + "grad_norm": 0.24607348442077637, + "learning_rate": 0.00019761513157894733, + "loss": 0.59, + "step": 1101 + }, + { + "epoch": 1.2225766190542227, + "grad_norm": 0.27295032143592834, + "learning_rate": 0.00019749177631578943, + "loss": 0.4905, + "step": 1102 + }, + { + "epoch": 1.2236860352239634, + "grad_norm": 0.2771230936050415, + "learning_rate": 0.00019736842105263157, + "loss": 0.66, + "step": 1103 + }, + { + "epoch": 1.224795451393704, + "grad_norm": 0.30468347668647766, + "learning_rate": 0.00019724506578947367, + "loss": 0.4982, + "step": 1104 + }, + { + "epoch": 1.2259048675634447, + "grad_norm": 0.2552284300327301, + "learning_rate": 0.0001971217105263158, + "loss": 0.3414, + "step": 1105 + }, + { + "epoch": 1.2270142837331854, + "grad_norm": 0.3029981255531311, + "learning_rate": 0.00019699835526315788, + "loss": 0.7422, + "step": 1106 + }, + { + "epoch": 1.2281236999029261, + "grad_norm": 0.27999430894851685, + "learning_rate": 0.00019687499999999997, + "loss": 0.5393, + "step": 1107 + }, + { + "epoch": 1.2292331160726668, + "grad_norm": 0.25855761766433716, + "learning_rate": 0.0001967516447368421, + "loss": 0.4018, + "step": 1108 + }, + { + "epoch": 1.2303425322424075, + "grad_norm": 0.4557599425315857, + "learning_rate": 0.0001966282894736842, + "loss": 0.5082, + "step": 1109 + }, + { + "epoch": 1.2314519484121482, + "grad_norm": 0.2822202742099762, + "learning_rate": 0.00019650493421052628, + "loss": 0.6512, + "step": 1110 + }, + { + "epoch": 1.2325613645818887, + "grad_norm": 0.31011104583740234, + "learning_rate": 0.0001963815789473684, + "loss": 0.3538, + "step": 1111 + }, + { + "epoch": 1.2336707807516294, + "grad_norm": 0.23474127054214478, + "learning_rate": 0.00019625822368421052, + "loss": 0.5368, + "step": 1112 + }, + { + "epoch": 1.23478019692137, + "grad_norm": 0.2853519320487976, + "learning_rate": 0.00019613486842105261, + "loss": 0.5207, + "step": 1113 + }, + { + "epoch": 1.2358896130911108, + "grad_norm": 0.32621482014656067, + "learning_rate": 0.00019601151315789473, + "loss": 0.555, + "step": 1114 + }, + { + "epoch": 1.2369990292608515, + "grad_norm": 0.2757001519203186, + "learning_rate": 0.00019588815789473683, + "loss": 0.6936, + "step": 1115 + }, + { + "epoch": 1.2381084454305922, + "grad_norm": 0.2345856875181198, + "learning_rate": 0.00019576480263157892, + "loss": 0.6683, + "step": 1116 + }, + { + "epoch": 1.2392178616003329, + "grad_norm": 0.2641811668872833, + "learning_rate": 0.00019564144736842104, + "loss": 0.4647, + "step": 1117 + }, + { + "epoch": 1.2403272777700736, + "grad_norm": 0.2421552538871765, + "learning_rate": 0.00019551809210526313, + "loss": 0.8658, + "step": 1118 + }, + { + "epoch": 1.2414366939398143, + "grad_norm": 0.3254922032356262, + "learning_rate": 0.00019539473684210523, + "loss": 0.4028, + "step": 1119 + }, + { + "epoch": 1.2425461101095547, + "grad_norm": 0.27477791905403137, + "learning_rate": 0.00019527138157894735, + "loss": 0.4876, + "step": 1120 + }, + { + "epoch": 1.2436555262792954, + "grad_norm": 0.37873339653015137, + "learning_rate": 0.00019514802631578944, + "loss": 0.5121, + "step": 1121 + }, + { + "epoch": 1.2447649424490361, + "grad_norm": 0.20323941111564636, + "learning_rate": 0.0001950246710526316, + "loss": 0.4327, + "step": 1122 + }, + { + "epoch": 1.2458743586187768, + "grad_norm": 0.2840193510055542, + "learning_rate": 0.00019490131578947368, + "loss": 0.3922, + "step": 1123 + }, + { + "epoch": 1.2469837747885175, + "grad_norm": 0.24474340677261353, + "learning_rate": 0.00019477796052631578, + "loss": 0.5334, + "step": 1124 + }, + { + "epoch": 1.2480931909582582, + "grad_norm": 0.25993865728378296, + "learning_rate": 0.00019465460526315787, + "loss": 0.3783, + "step": 1125 + }, + { + "epoch": 1.249202607127999, + "grad_norm": 0.3326314687728882, + "learning_rate": 0.00019453125, + "loss": 0.4944, + "step": 1126 + }, + { + "epoch": 1.2503120232977396, + "grad_norm": 0.26182398200035095, + "learning_rate": 0.00019440789473684208, + "loss": 0.8275, + "step": 1127 + }, + { + "epoch": 1.2514214394674803, + "grad_norm": 0.26601549983024597, + "learning_rate": 0.00019428453947368417, + "loss": 0.4694, + "step": 1128 + }, + { + "epoch": 1.2525308556372208, + "grad_norm": 0.27001309394836426, + "learning_rate": 0.0001941611842105263, + "loss": 0.4887, + "step": 1129 + }, + { + "epoch": 1.2536402718069617, + "grad_norm": 0.21264316141605377, + "learning_rate": 0.0001940378289473684, + "loss": 0.5031, + "step": 1130 + }, + { + "epoch": 1.2547496879767022, + "grad_norm": 0.1987910121679306, + "learning_rate": 0.00019391447368421054, + "loss": 0.3877, + "step": 1131 + }, + { + "epoch": 1.2558591041464429, + "grad_norm": 0.33749696612358093, + "learning_rate": 0.00019379111842105263, + "loss": 0.4784, + "step": 1132 + }, + { + "epoch": 1.2569685203161836, + "grad_norm": 0.26460304856300354, + "learning_rate": 0.00019366776315789472, + "loss": 0.5015, + "step": 1133 + }, + { + "epoch": 1.2580779364859243, + "grad_norm": 0.2513628602027893, + "learning_rate": 0.00019354440789473684, + "loss": 0.3158, + "step": 1134 + }, + { + "epoch": 1.259187352655665, + "grad_norm": 0.23956595361232758, + "learning_rate": 0.00019342105263157894, + "loss": 0.4248, + "step": 1135 + }, + { + "epoch": 1.2602967688254056, + "grad_norm": 0.33675435185432434, + "learning_rate": 0.00019329769736842103, + "loss": 0.5479, + "step": 1136 + }, + { + "epoch": 1.2614061849951463, + "grad_norm": 0.2596627473831177, + "learning_rate": 0.00019317434210526315, + "loss": 0.4415, + "step": 1137 + }, + { + "epoch": 1.262515601164887, + "grad_norm": 0.3592386543750763, + "learning_rate": 0.00019305098684210524, + "loss": 0.5607, + "step": 1138 + }, + { + "epoch": 1.2636250173346277, + "grad_norm": 0.3122079074382782, + "learning_rate": 0.00019292763157894734, + "loss": 0.891, + "step": 1139 + }, + { + "epoch": 1.2647344335043682, + "grad_norm": 0.31016677618026733, + "learning_rate": 0.00019280427631578943, + "loss": 0.5598, + "step": 1140 + }, + { + "epoch": 1.2658438496741091, + "grad_norm": 0.28511670231819153, + "learning_rate": 0.00019268092105263158, + "loss": 0.5666, + "step": 1141 + }, + { + "epoch": 1.2669532658438496, + "grad_norm": 0.3194934129714966, + "learning_rate": 0.00019255756578947367, + "loss": 0.6659, + "step": 1142 + }, + { + "epoch": 1.2680626820135903, + "grad_norm": 0.2699849307537079, + "learning_rate": 0.0001924342105263158, + "loss": 0.3064, + "step": 1143 + }, + { + "epoch": 1.269172098183331, + "grad_norm": 0.28152981400489807, + "learning_rate": 0.00019231085526315788, + "loss": 0.5019, + "step": 1144 + }, + { + "epoch": 1.2702815143530717, + "grad_norm": 0.34371238946914673, + "learning_rate": 0.00019218749999999998, + "loss": 0.4843, + "step": 1145 + }, + { + "epoch": 1.2713909305228124, + "grad_norm": 0.2585979402065277, + "learning_rate": 0.0001920641447368421, + "loss": 0.4984, + "step": 1146 + }, + { + "epoch": 1.272500346692553, + "grad_norm": 0.32947319746017456, + "learning_rate": 0.0001919407894736842, + "loss": 0.5213, + "step": 1147 + }, + { + "epoch": 1.2736097628622938, + "grad_norm": 0.22969017922878265, + "learning_rate": 0.00019181743421052628, + "loss": 0.429, + "step": 1148 + }, + { + "epoch": 1.2747191790320345, + "grad_norm": 0.22548457980155945, + "learning_rate": 0.0001916940789473684, + "loss": 0.3659, + "step": 1149 + }, + { + "epoch": 1.2758285952017752, + "grad_norm": 0.2924930155277252, + "learning_rate": 0.00019157072368421052, + "loss": 0.4644, + "step": 1150 + }, + { + "epoch": 1.2769380113715156, + "grad_norm": 0.31746160984039307, + "learning_rate": 0.00019144736842105262, + "loss": 0.5486, + "step": 1151 + }, + { + "epoch": 1.2780474275412563, + "grad_norm": 0.3976684808731079, + "learning_rate": 0.00019132401315789474, + "loss": 0.6632, + "step": 1152 + }, + { + "epoch": 1.279156843710997, + "grad_norm": 0.31601187586784363, + "learning_rate": 0.00019120065789473683, + "loss": 0.5185, + "step": 1153 + }, + { + "epoch": 1.2802662598807377, + "grad_norm": 0.2981434762477875, + "learning_rate": 0.00019107730263157892, + "loss": 0.4975, + "step": 1154 + }, + { + "epoch": 1.2813756760504784, + "grad_norm": 0.2698228657245636, + "learning_rate": 0.00019095394736842104, + "loss": 0.6271, + "step": 1155 + }, + { + "epoch": 1.2824850922202191, + "grad_norm": 0.3439400792121887, + "learning_rate": 0.00019083059210526314, + "loss": 0.6612, + "step": 1156 + }, + { + "epoch": 1.2835945083899598, + "grad_norm": 0.27007582783699036, + "learning_rate": 0.00019070723684210523, + "loss": 0.5057, + "step": 1157 + }, + { + "epoch": 1.2847039245597005, + "grad_norm": 0.29291701316833496, + "learning_rate": 0.00019058388157894735, + "loss": 0.5788, + "step": 1158 + }, + { + "epoch": 1.2858133407294412, + "grad_norm": 0.41224008798599243, + "learning_rate": 0.00019046052631578944, + "loss": 0.6633, + "step": 1159 + }, + { + "epoch": 1.2869227568991817, + "grad_norm": 0.3880039155483246, + "learning_rate": 0.00019033717105263156, + "loss": 0.5065, + "step": 1160 + }, + { + "epoch": 1.2880321730689226, + "grad_norm": 0.3320450186729431, + "learning_rate": 0.00019021381578947368, + "loss": 0.6414, + "step": 1161 + }, + { + "epoch": 1.289141589238663, + "grad_norm": 0.2895980477333069, + "learning_rate": 0.00019009046052631578, + "loss": 0.842, + "step": 1162 + }, + { + "epoch": 1.2902510054084038, + "grad_norm": 0.2721669673919678, + "learning_rate": 0.00018996710526315787, + "loss": 0.4492, + "step": 1163 + }, + { + "epoch": 1.2913604215781445, + "grad_norm": 0.28013283014297485, + "learning_rate": 0.00018984375, + "loss": 0.3661, + "step": 1164 + }, + { + "epoch": 1.2924698377478852, + "grad_norm": 0.22734491527080536, + "learning_rate": 0.00018972039473684208, + "loss": 0.5751, + "step": 1165 + }, + { + "epoch": 1.2935792539176258, + "grad_norm": 0.2372591197490692, + "learning_rate": 0.00018959703947368418, + "loss": 0.4572, + "step": 1166 + }, + { + "epoch": 1.2946886700873665, + "grad_norm": 0.3541167080402374, + "learning_rate": 0.0001894736842105263, + "loss": 0.6481, + "step": 1167 + }, + { + "epoch": 1.2957980862571072, + "grad_norm": 0.23032468557357788, + "learning_rate": 0.0001893503289473684, + "loss": 0.3583, + "step": 1168 + }, + { + "epoch": 1.296907502426848, + "grad_norm": 0.20932866632938385, + "learning_rate": 0.00018922697368421054, + "loss": 0.3898, + "step": 1169 + }, + { + "epoch": 1.2980169185965886, + "grad_norm": 0.2942456603050232, + "learning_rate": 0.00018910361842105263, + "loss": 0.4076, + "step": 1170 + }, + { + "epoch": 1.299126334766329, + "grad_norm": 0.28456342220306396, + "learning_rate": 0.00018898026315789472, + "loss": 0.2953, + "step": 1171 + }, + { + "epoch": 1.30023575093607, + "grad_norm": 0.39009782671928406, + "learning_rate": 0.00018885690789473684, + "loss": 0.511, + "step": 1172 + }, + { + "epoch": 1.3013451671058105, + "grad_norm": 0.22601839900016785, + "learning_rate": 0.00018873355263157894, + "loss": 0.5164, + "step": 1173 + }, + { + "epoch": 1.3024545832755512, + "grad_norm": 0.23257453739643097, + "learning_rate": 0.00018861019736842103, + "loss": 0.4081, + "step": 1174 + }, + { + "epoch": 1.3035639994452919, + "grad_norm": 0.2568961977958679, + "learning_rate": 0.00018848684210526312, + "loss": 0.4208, + "step": 1175 + }, + { + "epoch": 1.3046734156150326, + "grad_norm": 0.33718129992485046, + "learning_rate": 0.00018836348684210524, + "loss": 0.3738, + "step": 1176 + }, + { + "epoch": 1.3057828317847733, + "grad_norm": 0.24113743007183075, + "learning_rate": 0.00018824013157894734, + "loss": 0.4668, + "step": 1177 + }, + { + "epoch": 1.306892247954514, + "grad_norm": 0.2619990110397339, + "learning_rate": 0.00018811677631578943, + "loss": 0.6427, + "step": 1178 + }, + { + "epoch": 1.3080016641242547, + "grad_norm": 0.24512606859207153, + "learning_rate": 0.00018799342105263158, + "loss": 0.4494, + "step": 1179 + }, + { + "epoch": 1.3091110802939954, + "grad_norm": 0.23559564352035522, + "learning_rate": 0.00018787006578947367, + "loss": 0.4123, + "step": 1180 + }, + { + "epoch": 1.310220496463736, + "grad_norm": 0.24036440253257751, + "learning_rate": 0.0001877467105263158, + "loss": 0.4689, + "step": 1181 + }, + { + "epoch": 1.3113299126334765, + "grad_norm": 0.29898273944854736, + "learning_rate": 0.00018762335526315788, + "loss": 0.2823, + "step": 1182 + }, + { + "epoch": 1.3124393288032172, + "grad_norm": 0.2730746567249298, + "learning_rate": 0.00018749999999999998, + "loss": 0.7834, + "step": 1183 + }, + { + "epoch": 1.313548744972958, + "grad_norm": 0.30577996373176575, + "learning_rate": 0.0001873766447368421, + "loss": 0.6518, + "step": 1184 + }, + { + "epoch": 1.3146581611426986, + "grad_norm": 0.1852562427520752, + "learning_rate": 0.0001872532894736842, + "loss": 0.3995, + "step": 1185 + }, + { + "epoch": 1.3157675773124393, + "grad_norm": 0.3734980821609497, + "learning_rate": 0.00018712993421052628, + "loss": 0.8202, + "step": 1186 + }, + { + "epoch": 1.31687699348218, + "grad_norm": 0.1997169852256775, + "learning_rate": 0.0001870065789473684, + "loss": 0.5287, + "step": 1187 + }, + { + "epoch": 1.3179864096519207, + "grad_norm": 0.29843178391456604, + "learning_rate": 0.00018688322368421052, + "loss": 0.4611, + "step": 1188 + }, + { + "epoch": 1.3190958258216614, + "grad_norm": 0.24156174063682556, + "learning_rate": 0.00018675986842105262, + "loss": 0.7034, + "step": 1189 + }, + { + "epoch": 1.320205241991402, + "grad_norm": 0.3248981237411499, + "learning_rate": 0.00018663651315789474, + "loss": 0.562, + "step": 1190 + }, + { + "epoch": 1.3213146581611426, + "grad_norm": 0.22303232550621033, + "learning_rate": 0.00018651315789473683, + "loss": 0.428, + "step": 1191 + }, + { + "epoch": 1.3224240743308835, + "grad_norm": 0.32691171765327454, + "learning_rate": 0.00018638980263157892, + "loss": 0.5038, + "step": 1192 + }, + { + "epoch": 1.323533490500624, + "grad_norm": 0.23463557660579681, + "learning_rate": 0.00018626644736842104, + "loss": 0.5723, + "step": 1193 + }, + { + "epoch": 1.3246429066703647, + "grad_norm": 0.23432673513889313, + "learning_rate": 0.00018614309210526314, + "loss": 0.2767, + "step": 1194 + }, + { + "epoch": 1.3257523228401054, + "grad_norm": 0.30433669686317444, + "learning_rate": 0.00018601973684210523, + "loss": 0.5931, + "step": 1195 + }, + { + "epoch": 1.326861739009846, + "grad_norm": 0.2979690432548523, + "learning_rate": 0.00018589638157894735, + "loss": 0.6678, + "step": 1196 + }, + { + "epoch": 1.3279711551795867, + "grad_norm": 0.279066801071167, + "learning_rate": 0.00018577302631578944, + "loss": 0.6302, + "step": 1197 + }, + { + "epoch": 1.3290805713493274, + "grad_norm": 0.28714966773986816, + "learning_rate": 0.00018564967105263156, + "loss": 0.494, + "step": 1198 + }, + { + "epoch": 1.3301899875190681, + "grad_norm": 0.21045270562171936, + "learning_rate": 0.00018552631578947368, + "loss": 0.5388, + "step": 1199 + }, + { + "epoch": 1.3312994036888088, + "grad_norm": 0.27514612674713135, + "learning_rate": 0.00018540296052631578, + "loss": 0.3849, + "step": 1200 + }, + { + "epoch": 1.3324088198585495, + "grad_norm": 0.3071988523006439, + "learning_rate": 0.00018527960526315787, + "loss": 0.4609, + "step": 1201 + }, + { + "epoch": 1.33351823602829, + "grad_norm": 0.251871794462204, + "learning_rate": 0.00018515625, + "loss": 0.7988, + "step": 1202 + }, + { + "epoch": 1.334627652198031, + "grad_norm": 0.273266077041626, + "learning_rate": 0.00018503289473684208, + "loss": 0.3886, + "step": 1203 + }, + { + "epoch": 1.3357370683677714, + "grad_norm": 0.28145653009414673, + "learning_rate": 0.00018490953947368418, + "loss": 0.556, + "step": 1204 + }, + { + "epoch": 1.336846484537512, + "grad_norm": 0.23886194825172424, + "learning_rate": 0.0001847861842105263, + "loss": 0.4633, + "step": 1205 + }, + { + "epoch": 1.3379559007072528, + "grad_norm": 0.2713840901851654, + "learning_rate": 0.0001846628289473684, + "loss": 0.6669, + "step": 1206 + }, + { + "epoch": 1.3390653168769935, + "grad_norm": 0.19628193974494934, + "learning_rate": 0.00018453947368421054, + "loss": 0.5268, + "step": 1207 + }, + { + "epoch": 1.3401747330467342, + "grad_norm": 0.32730063796043396, + "learning_rate": 0.00018441611842105263, + "loss": 0.4295, + "step": 1208 + }, + { + "epoch": 1.3412841492164749, + "grad_norm": 0.244260773062706, + "learning_rate": 0.00018429276315789472, + "loss": 0.4108, + "step": 1209 + }, + { + "epoch": 1.3423935653862156, + "grad_norm": 0.30566221475601196, + "learning_rate": 0.00018416940789473684, + "loss": 0.8457, + "step": 1210 + }, + { + "epoch": 1.343502981555956, + "grad_norm": 0.2607499659061432, + "learning_rate": 0.00018404605263157894, + "loss": 0.4235, + "step": 1211 + }, + { + "epoch": 1.344612397725697, + "grad_norm": 0.359625905752182, + "learning_rate": 0.00018392269736842103, + "loss": 0.5669, + "step": 1212 + }, + { + "epoch": 1.3457218138954374, + "grad_norm": 0.2476588487625122, + "learning_rate": 0.00018379934210526312, + "loss": 0.3539, + "step": 1213 + }, + { + "epoch": 1.3468312300651781, + "grad_norm": 0.20041054487228394, + "learning_rate": 0.00018367598684210524, + "loss": 0.3843, + "step": 1214 + }, + { + "epoch": 1.3479406462349188, + "grad_norm": 0.4561375677585602, + "learning_rate": 0.00018355263157894734, + "loss": 0.6868, + "step": 1215 + }, + { + "epoch": 1.3490500624046595, + "grad_norm": 0.24837137758731842, + "learning_rate": 0.00018342927631578943, + "loss": 0.4685, + "step": 1216 + }, + { + "epoch": 1.3501594785744002, + "grad_norm": 0.3139680027961731, + "learning_rate": 0.00018330592105263158, + "loss": 0.5152, + "step": 1217 + }, + { + "epoch": 1.351268894744141, + "grad_norm": 0.2819393277168274, + "learning_rate": 0.00018318256578947367, + "loss": 0.4204, + "step": 1218 + }, + { + "epoch": 1.3523783109138816, + "grad_norm": 0.2980377972126007, + "learning_rate": 0.0001830592105263158, + "loss": 0.4413, + "step": 1219 + }, + { + "epoch": 1.3534877270836223, + "grad_norm": 0.29253172874450684, + "learning_rate": 0.00018293585526315788, + "loss": 0.5169, + "step": 1220 + }, + { + "epoch": 1.354597143253363, + "grad_norm": 0.3686385154724121, + "learning_rate": 0.00018281249999999998, + "loss": 0.4958, + "step": 1221 + }, + { + "epoch": 1.3557065594231035, + "grad_norm": 0.29562124609947205, + "learning_rate": 0.0001826891447368421, + "loss": 0.5406, + "step": 1222 + }, + { + "epoch": 1.3568159755928444, + "grad_norm": 0.21728160977363586, + "learning_rate": 0.0001825657894736842, + "loss": 0.412, + "step": 1223 + }, + { + "epoch": 1.3579253917625849, + "grad_norm": 0.23864157497882843, + "learning_rate": 0.00018244243421052628, + "loss": 0.4752, + "step": 1224 + }, + { + "epoch": 1.3590348079323256, + "grad_norm": 0.24741685390472412, + "learning_rate": 0.00018231907894736838, + "loss": 0.5245, + "step": 1225 + }, + { + "epoch": 1.3601442241020663, + "grad_norm": 0.2409430295228958, + "learning_rate": 0.00018219572368421053, + "loss": 0.4216, + "step": 1226 + }, + { + "epoch": 1.361253640271807, + "grad_norm": 0.25885629653930664, + "learning_rate": 0.00018207236842105262, + "loss": 0.4052, + "step": 1227 + }, + { + "epoch": 1.3623630564415476, + "grad_norm": 0.28579777479171753, + "learning_rate": 0.00018194901315789474, + "loss": 0.4137, + "step": 1228 + }, + { + "epoch": 1.3634724726112883, + "grad_norm": 0.23780061304569244, + "learning_rate": 0.00018182565789473683, + "loss": 0.4971, + "step": 1229 + }, + { + "epoch": 1.364581888781029, + "grad_norm": 0.32252663373947144, + "learning_rate": 0.00018170230263157893, + "loss": 0.5721, + "step": 1230 + }, + { + "epoch": 1.3656913049507697, + "grad_norm": 0.3731588125228882, + "learning_rate": 0.00018157894736842105, + "loss": 0.4581, + "step": 1231 + }, + { + "epoch": 1.3668007211205104, + "grad_norm": 0.15041133761405945, + "learning_rate": 0.00018145559210526314, + "loss": 0.3275, + "step": 1232 + }, + { + "epoch": 1.367910137290251, + "grad_norm": 0.4989398419857025, + "learning_rate": 0.00018133223684210523, + "loss": 0.3791, + "step": 1233 + }, + { + "epoch": 1.3690195534599916, + "grad_norm": 0.4500264525413513, + "learning_rate": 0.00018120888157894735, + "loss": 0.4119, + "step": 1234 + }, + { + "epoch": 1.3701289696297323, + "grad_norm": 0.2674502432346344, + "learning_rate": 0.00018108552631578945, + "loss": 0.6938, + "step": 1235 + }, + { + "epoch": 1.371238385799473, + "grad_norm": 0.363046258687973, + "learning_rate": 0.00018096217105263157, + "loss": 0.4881, + "step": 1236 + }, + { + "epoch": 1.3723478019692137, + "grad_norm": 0.38243699073791504, + "learning_rate": 0.00018083881578947369, + "loss": 0.6211, + "step": 1237 + }, + { + "epoch": 1.3734572181389544, + "grad_norm": 0.25102144479751587, + "learning_rate": 0.00018071546052631578, + "loss": 0.4659, + "step": 1238 + }, + { + "epoch": 1.374566634308695, + "grad_norm": 0.546259343624115, + "learning_rate": 0.00018059210526315787, + "loss": 0.4187, + "step": 1239 + }, + { + "epoch": 1.3756760504784358, + "grad_norm": 0.3232324719429016, + "learning_rate": 0.00018046875, + "loss": 0.457, + "step": 1240 + }, + { + "epoch": 1.3767854666481765, + "grad_norm": 0.2647280693054199, + "learning_rate": 0.00018034539473684209, + "loss": 0.5801, + "step": 1241 + }, + { + "epoch": 1.377894882817917, + "grad_norm": 0.43696561455726624, + "learning_rate": 0.00018022203947368418, + "loss": 0.5091, + "step": 1242 + }, + { + "epoch": 1.3790042989876579, + "grad_norm": 0.25493887066841125, + "learning_rate": 0.0001800986842105263, + "loss": 0.4163, + "step": 1243 + }, + { + "epoch": 1.3801137151573983, + "grad_norm": 0.32590964436531067, + "learning_rate": 0.0001799753289473684, + "loss": 0.4611, + "step": 1244 + }, + { + "epoch": 1.381223131327139, + "grad_norm": 0.28104108572006226, + "learning_rate": 0.00017985197368421054, + "loss": 0.3906, + "step": 1245 + }, + { + "epoch": 1.3823325474968797, + "grad_norm": 0.3454284369945526, + "learning_rate": 0.00017972861842105263, + "loss": 0.5055, + "step": 1246 + }, + { + "epoch": 1.3834419636666204, + "grad_norm": 0.3004135489463806, + "learning_rate": 0.00017960526315789473, + "loss": 0.5906, + "step": 1247 + }, + { + "epoch": 1.3845513798363611, + "grad_norm": 0.3798172175884247, + "learning_rate": 0.00017948190789473682, + "loss": 0.3812, + "step": 1248 + }, + { + "epoch": 1.3856607960061018, + "grad_norm": 0.23392631113529205, + "learning_rate": 0.00017935855263157894, + "loss": 0.3541, + "step": 1249 + }, + { + "epoch": 1.3867702121758425, + "grad_norm": 0.28611382842063904, + "learning_rate": 0.00017923519736842103, + "loss": 0.3994, + "step": 1250 + }, + { + "epoch": 1.3878796283455832, + "grad_norm": 0.23118913173675537, + "learning_rate": 0.00017911184210526313, + "loss": 0.5034, + "step": 1251 + }, + { + "epoch": 1.388989044515324, + "grad_norm": 0.3240826725959778, + "learning_rate": 0.00017898848684210525, + "loss": 0.7616, + "step": 1252 + }, + { + "epoch": 1.3900984606850644, + "grad_norm": 0.35286498069763184, + "learning_rate": 0.00017886513157894734, + "loss": 0.7485, + "step": 1253 + }, + { + "epoch": 1.3912078768548053, + "grad_norm": 0.23060156404972076, + "learning_rate": 0.00017874177631578943, + "loss": 0.4549, + "step": 1254 + }, + { + "epoch": 1.3923172930245458, + "grad_norm": 0.3174906075000763, + "learning_rate": 0.00017861842105263158, + "loss": 0.4251, + "step": 1255 + }, + { + "epoch": 1.3934267091942865, + "grad_norm": 0.24730369448661804, + "learning_rate": 0.00017849506578947367, + "loss": 0.4357, + "step": 1256 + }, + { + "epoch": 1.3945361253640272, + "grad_norm": 0.2686748206615448, + "learning_rate": 0.0001783717105263158, + "loss": 0.3928, + "step": 1257 + }, + { + "epoch": 1.3956455415337679, + "grad_norm": 0.2948950231075287, + "learning_rate": 0.00017824835526315789, + "loss": 0.6208, + "step": 1258 + }, + { + "epoch": 1.3967549577035085, + "grad_norm": 0.30887314677238464, + "learning_rate": 0.00017812499999999998, + "loss": 0.4876, + "step": 1259 + }, + { + "epoch": 1.3978643738732492, + "grad_norm": 0.3009176552295685, + "learning_rate": 0.0001780016447368421, + "loss": 0.3689, + "step": 1260 + }, + { + "epoch": 1.39897379004299, + "grad_norm": 0.24447450041770935, + "learning_rate": 0.0001778782894736842, + "loss": 0.3824, + "step": 1261 + }, + { + "epoch": 1.4000832062127304, + "grad_norm": 0.2673259973526001, + "learning_rate": 0.00017775493421052629, + "loss": 0.5425, + "step": 1262 + }, + { + "epoch": 1.4011926223824713, + "grad_norm": 0.273612916469574, + "learning_rate": 0.00017763157894736838, + "loss": 0.3522, + "step": 1263 + }, + { + "epoch": 1.4023020385522118, + "grad_norm": 0.3372279405593872, + "learning_rate": 0.00017750822368421053, + "loss": 0.4351, + "step": 1264 + }, + { + "epoch": 1.4034114547219525, + "grad_norm": 0.23103949427604675, + "learning_rate": 0.00017738486842105262, + "loss": 0.4426, + "step": 1265 + }, + { + "epoch": 1.4045208708916932, + "grad_norm": 0.21246463060379028, + "learning_rate": 0.00017726151315789474, + "loss": 0.5233, + "step": 1266 + }, + { + "epoch": 1.405630287061434, + "grad_norm": 0.2304743230342865, + "learning_rate": 0.00017713815789473683, + "loss": 0.3163, + "step": 1267 + }, + { + "epoch": 1.4067397032311746, + "grad_norm": 0.35149502754211426, + "learning_rate": 0.00017701480263157893, + "loss": 0.6481, + "step": 1268 + }, + { + "epoch": 1.4078491194009153, + "grad_norm": 0.2509346604347229, + "learning_rate": 0.00017689144736842105, + "loss": 0.6448, + "step": 1269 + }, + { + "epoch": 1.408958535570656, + "grad_norm": 0.25470322370529175, + "learning_rate": 0.00017676809210526314, + "loss": 0.5486, + "step": 1270 + }, + { + "epoch": 1.4100679517403967, + "grad_norm": 0.24675153195858002, + "learning_rate": 0.00017664473684210523, + "loss": 0.4237, + "step": 1271 + }, + { + "epoch": 1.4111773679101374, + "grad_norm": 0.38935887813568115, + "learning_rate": 0.00017652138157894735, + "loss": 0.5769, + "step": 1272 + }, + { + "epoch": 1.4122867840798778, + "grad_norm": 0.22933362424373627, + "learning_rate": 0.00017639802631578945, + "loss": 0.4627, + "step": 1273 + }, + { + "epoch": 1.4133962002496188, + "grad_norm": 0.4441911280155182, + "learning_rate": 0.00017627467105263157, + "loss": 0.5397, + "step": 1274 + }, + { + "epoch": 1.4145056164193592, + "grad_norm": 0.24430730938911438, + "learning_rate": 0.0001761513157894737, + "loss": 0.4419, + "step": 1275 + }, + { + "epoch": 1.4156150325891, + "grad_norm": 0.280831515789032, + "learning_rate": 0.00017602796052631578, + "loss": 0.3993, + "step": 1276 + }, + { + "epoch": 1.4167244487588406, + "grad_norm": 0.32151225209236145, + "learning_rate": 0.00017590460526315787, + "loss": 0.5654, + "step": 1277 + }, + { + "epoch": 1.4178338649285813, + "grad_norm": 0.3048468828201294, + "learning_rate": 0.00017578125, + "loss": 0.4373, + "step": 1278 + }, + { + "epoch": 1.418943281098322, + "grad_norm": 0.44277939200401306, + "learning_rate": 0.0001756578947368421, + "loss": 0.4243, + "step": 1279 + }, + { + "epoch": 1.4200526972680627, + "grad_norm": 0.3722161054611206, + "learning_rate": 0.00017553453947368418, + "loss": 0.5524, + "step": 1280 + }, + { + "epoch": 1.4211621134378034, + "grad_norm": 0.24039240181446075, + "learning_rate": 0.0001754111842105263, + "loss": 0.4882, + "step": 1281 + }, + { + "epoch": 1.422271529607544, + "grad_norm": 0.34307026863098145, + "learning_rate": 0.0001752878289473684, + "loss": 0.5102, + "step": 1282 + }, + { + "epoch": 1.4233809457772848, + "grad_norm": 0.2590845227241516, + "learning_rate": 0.00017516447368421051, + "loss": 0.4354, + "step": 1283 + }, + { + "epoch": 1.4244903619470253, + "grad_norm": 0.25644242763519287, + "learning_rate": 0.00017504111842105263, + "loss": 0.4385, + "step": 1284 + }, + { + "epoch": 1.425599778116766, + "grad_norm": 0.25579833984375, + "learning_rate": 0.00017491776315789473, + "loss": 0.5522, + "step": 1285 + }, + { + "epoch": 1.4267091942865067, + "grad_norm": 0.26802870631217957, + "learning_rate": 0.00017479440789473682, + "loss": 0.369, + "step": 1286 + }, + { + "epoch": 1.4278186104562474, + "grad_norm": 0.28027182817459106, + "learning_rate": 0.00017467105263157894, + "loss": 0.485, + "step": 1287 + }, + { + "epoch": 1.428928026625988, + "grad_norm": 0.5787771344184875, + "learning_rate": 0.00017454769736842103, + "loss": 0.5496, + "step": 1288 + }, + { + "epoch": 1.4300374427957288, + "grad_norm": 0.24002310633659363, + "learning_rate": 0.00017442434210526313, + "loss": 0.4172, + "step": 1289 + }, + { + "epoch": 1.4311468589654694, + "grad_norm": 0.34913745522499084, + "learning_rate": 0.00017430098684210525, + "loss": 0.6889, + "step": 1290 + }, + { + "epoch": 1.4322562751352101, + "grad_norm": 0.2884618639945984, + "learning_rate": 0.00017417763157894734, + "loss": 0.4579, + "step": 1291 + }, + { + "epoch": 1.4333656913049508, + "grad_norm": 0.2817135751247406, + "learning_rate": 0.00017405427631578943, + "loss": 0.4721, + "step": 1292 + }, + { + "epoch": 1.4344751074746913, + "grad_norm": 0.2580573558807373, + "learning_rate": 0.00017393092105263158, + "loss": 0.4966, + "step": 1293 + }, + { + "epoch": 1.4355845236444322, + "grad_norm": 0.3415181338787079, + "learning_rate": 0.00017380756578947367, + "loss": 0.4475, + "step": 1294 + }, + { + "epoch": 1.4366939398141727, + "grad_norm": 0.2799202799797058, + "learning_rate": 0.0001736842105263158, + "loss": 0.4047, + "step": 1295 + }, + { + "epoch": 1.4378033559839134, + "grad_norm": 0.3765754699707031, + "learning_rate": 0.0001735608552631579, + "loss": 0.4161, + "step": 1296 + }, + { + "epoch": 1.438912772153654, + "grad_norm": 0.2828143835067749, + "learning_rate": 0.00017343749999999998, + "loss": 0.544, + "step": 1297 + }, + { + "epoch": 1.4400221883233948, + "grad_norm": 0.4222780466079712, + "learning_rate": 0.00017331414473684207, + "loss": 0.4053, + "step": 1298 + }, + { + "epoch": 1.4411316044931355, + "grad_norm": 0.2224740982055664, + "learning_rate": 0.0001731907894736842, + "loss": 0.343, + "step": 1299 + }, + { + "epoch": 1.4422410206628762, + "grad_norm": 0.301512748003006, + "learning_rate": 0.0001730674342105263, + "loss": 0.5385, + "step": 1300 + }, + { + "epoch": 1.4433504368326169, + "grad_norm": 0.24199537932872772, + "learning_rate": 0.00017294407894736838, + "loss": 0.4618, + "step": 1301 + }, + { + "epoch": 1.4444598530023576, + "grad_norm": 0.2662793695926666, + "learning_rate": 0.00017282072368421053, + "loss": 0.3893, + "step": 1302 + }, + { + "epoch": 1.4455692691720983, + "grad_norm": 0.6027406454086304, + "learning_rate": 0.00017269736842105262, + "loss": 0.7918, + "step": 1303 + }, + { + "epoch": 1.4466786853418387, + "grad_norm": 0.33173203468322754, + "learning_rate": 0.00017257401315789474, + "loss": 0.3696, + "step": 1304 + }, + { + "epoch": 1.4477881015115797, + "grad_norm": 0.42119306325912476, + "learning_rate": 0.00017245065789473683, + "loss": 0.3504, + "step": 1305 + }, + { + "epoch": 1.4488975176813201, + "grad_norm": 0.33368048071861267, + "learning_rate": 0.00017232730263157893, + "loss": 0.543, + "step": 1306 + }, + { + "epoch": 1.4500069338510608, + "grad_norm": 0.3724362850189209, + "learning_rate": 0.00017220394736842105, + "loss": 0.483, + "step": 1307 + }, + { + "epoch": 1.4511163500208015, + "grad_norm": 0.28803780674934387, + "learning_rate": 0.00017208059210526314, + "loss": 0.6044, + "step": 1308 + }, + { + "epoch": 1.4522257661905422, + "grad_norm": 0.29041630029678345, + "learning_rate": 0.00017195723684210523, + "loss": 0.4351, + "step": 1309 + }, + { + "epoch": 1.453335182360283, + "grad_norm": 0.27980196475982666, + "learning_rate": 0.00017183388157894735, + "loss": 0.2806, + "step": 1310 + }, + { + "epoch": 1.4544445985300236, + "grad_norm": 0.5144875049591064, + "learning_rate": 0.00017171052631578945, + "loss": 0.6004, + "step": 1311 + }, + { + "epoch": 1.4555540146997643, + "grad_norm": 0.41391658782958984, + "learning_rate": 0.00017158717105263157, + "loss": 0.3426, + "step": 1312 + }, + { + "epoch": 1.456663430869505, + "grad_norm": 0.38778960704803467, + "learning_rate": 0.0001714638157894737, + "loss": 0.5716, + "step": 1313 + }, + { + "epoch": 1.4577728470392457, + "grad_norm": 0.29660889506340027, + "learning_rate": 0.00017134046052631578, + "loss": 0.6084, + "step": 1314 + }, + { + "epoch": 1.4588822632089862, + "grad_norm": 0.24347136914730072, + "learning_rate": 0.00017121710526315787, + "loss": 0.6538, + "step": 1315 + }, + { + "epoch": 1.4599916793787269, + "grad_norm": 0.2733883857727051, + "learning_rate": 0.00017109375, + "loss": 0.5797, + "step": 1316 + }, + { + "epoch": 1.4611010955484676, + "grad_norm": 0.33641162514686584, + "learning_rate": 0.0001709703947368421, + "loss": 0.4837, + "step": 1317 + }, + { + "epoch": 1.4622105117182083, + "grad_norm": 0.24747195839881897, + "learning_rate": 0.00017084703947368418, + "loss": 0.4617, + "step": 1318 + }, + { + "epoch": 1.463319927887949, + "grad_norm": 0.2453019767999649, + "learning_rate": 0.0001707236842105263, + "loss": 0.5242, + "step": 1319 + }, + { + "epoch": 1.4644293440576897, + "grad_norm": 0.2680438160896301, + "learning_rate": 0.0001706003289473684, + "loss": 0.3985, + "step": 1320 + }, + { + "epoch": 1.4655387602274303, + "grad_norm": 0.22616985440254211, + "learning_rate": 0.00017047697368421051, + "loss": 0.5428, + "step": 1321 + }, + { + "epoch": 1.466648176397171, + "grad_norm": 0.25485649704933167, + "learning_rate": 0.00017035361842105264, + "loss": 0.4411, + "step": 1322 + }, + { + "epoch": 1.4677575925669117, + "grad_norm": 0.19676417112350464, + "learning_rate": 0.00017023026315789473, + "loss": 0.4987, + "step": 1323 + }, + { + "epoch": 1.4688670087366522, + "grad_norm": 0.23980014026165009, + "learning_rate": 0.00017010690789473682, + "loss": 0.2853, + "step": 1324 + }, + { + "epoch": 1.4699764249063931, + "grad_norm": 0.31723615527153015, + "learning_rate": 0.00016998355263157894, + "loss": 0.3271, + "step": 1325 + }, + { + "epoch": 1.4710858410761336, + "grad_norm": 0.2783150374889374, + "learning_rate": 0.00016986019736842103, + "loss": 0.4648, + "step": 1326 + }, + { + "epoch": 1.4721952572458743, + "grad_norm": 0.3233512341976166, + "learning_rate": 0.00016973684210526313, + "loss": 0.4557, + "step": 1327 + }, + { + "epoch": 1.473304673415615, + "grad_norm": 0.2481250911951065, + "learning_rate": 0.00016961348684210525, + "loss": 0.5224, + "step": 1328 + }, + { + "epoch": 1.4744140895853557, + "grad_norm": 0.317909300327301, + "learning_rate": 0.00016949013157894734, + "loss": 0.642, + "step": 1329 + }, + { + "epoch": 1.4755235057550964, + "grad_norm": 0.25988101959228516, + "learning_rate": 0.00016936677631578943, + "loss": 0.3956, + "step": 1330 + }, + { + "epoch": 1.476632921924837, + "grad_norm": 0.2583842873573303, + "learning_rate": 0.00016924342105263158, + "loss": 0.4031, + "step": 1331 + }, + { + "epoch": 1.4777423380945778, + "grad_norm": 0.2636638283729553, + "learning_rate": 0.00016912006578947368, + "loss": 0.4351, + "step": 1332 + }, + { + "epoch": 1.4788517542643185, + "grad_norm": 0.2658294439315796, + "learning_rate": 0.00016899671052631577, + "loss": 0.4895, + "step": 1333 + }, + { + "epoch": 1.4799611704340592, + "grad_norm": 0.192026287317276, + "learning_rate": 0.0001688733552631579, + "loss": 0.5236, + "step": 1334 + }, + { + "epoch": 1.4810705866037996, + "grad_norm": 0.2988239824771881, + "learning_rate": 0.00016874999999999998, + "loss": 0.4434, + "step": 1335 + }, + { + "epoch": 1.4821800027735406, + "grad_norm": 0.32316145300865173, + "learning_rate": 0.00016862664473684207, + "loss": 0.5385, + "step": 1336 + }, + { + "epoch": 1.483289418943281, + "grad_norm": 0.3037776052951813, + "learning_rate": 0.0001685032894736842, + "loss": 0.3864, + "step": 1337 + }, + { + "epoch": 1.4843988351130217, + "grad_norm": 0.3819033205509186, + "learning_rate": 0.0001683799342105263, + "loss": 0.4825, + "step": 1338 + }, + { + "epoch": 1.4855082512827624, + "grad_norm": 0.346760630607605, + "learning_rate": 0.00016825657894736838, + "loss": 0.5131, + "step": 1339 + }, + { + "epoch": 1.4866176674525031, + "grad_norm": 0.27673614025115967, + "learning_rate": 0.00016813322368421053, + "loss": 0.4316, + "step": 1340 + }, + { + "epoch": 1.4877270836222438, + "grad_norm": 0.21999450027942657, + "learning_rate": 0.00016800986842105262, + "loss": 0.4106, + "step": 1341 + }, + { + "epoch": 1.4888364997919845, + "grad_norm": 0.30723118782043457, + "learning_rate": 0.00016788651315789474, + "loss": 0.5362, + "step": 1342 + }, + { + "epoch": 1.4899459159617252, + "grad_norm": 0.4165399372577667, + "learning_rate": 0.00016776315789473684, + "loss": 0.6185, + "step": 1343 + }, + { + "epoch": 1.4910553321314657, + "grad_norm": 0.2928377091884613, + "learning_rate": 0.00016763980263157893, + "loss": 0.4006, + "step": 1344 + }, + { + "epoch": 1.4921647483012066, + "grad_norm": 0.405435711145401, + "learning_rate": 0.00016751644736842105, + "loss": 0.4493, + "step": 1345 + }, + { + "epoch": 1.493274164470947, + "grad_norm": 0.2776191830635071, + "learning_rate": 0.00016739309210526314, + "loss": 0.4531, + "step": 1346 + }, + { + "epoch": 1.4943835806406878, + "grad_norm": 0.19967693090438843, + "learning_rate": 0.00016726973684210524, + "loss": 0.5397, + "step": 1347 + }, + { + "epoch": 1.4954929968104285, + "grad_norm": 0.22307896614074707, + "learning_rate": 0.00016714638157894733, + "loss": 0.6485, + "step": 1348 + }, + { + "epoch": 1.4966024129801692, + "grad_norm": 0.34726205468177795, + "learning_rate": 0.00016702302631578945, + "loss": 0.7418, + "step": 1349 + }, + { + "epoch": 1.4977118291499099, + "grad_norm": 0.2557240128517151, + "learning_rate": 0.00016689967105263157, + "loss": 0.2897, + "step": 1350 + }, + { + "epoch": 1.4988212453196506, + "grad_norm": 0.2983255386352539, + "learning_rate": 0.0001667763157894737, + "loss": 0.4885, + "step": 1351 + }, + { + "epoch": 1.4999306614893912, + "grad_norm": 0.2455969750881195, + "learning_rate": 0.00016665296052631578, + "loss": 0.5027, + "step": 1352 + }, + { + "epoch": 1.5010400776591317, + "grad_norm": 0.2705221474170685, + "learning_rate": 0.00016652960526315788, + "loss": 0.5434, + "step": 1353 + }, + { + "epoch": 1.5021494938288726, + "grad_norm": 0.27649638056755066, + "learning_rate": 0.00016640625, + "loss": 0.5844, + "step": 1354 + }, + { + "epoch": 1.5032589099986131, + "grad_norm": 0.28695183992385864, + "learning_rate": 0.0001662828947368421, + "loss": 0.4698, + "step": 1355 + }, + { + "epoch": 1.504368326168354, + "grad_norm": 0.3684578537940979, + "learning_rate": 0.00016615953947368418, + "loss": 0.5555, + "step": 1356 + }, + { + "epoch": 1.5054777423380945, + "grad_norm": 0.1808445006608963, + "learning_rate": 0.0001660361842105263, + "loss": 0.4829, + "step": 1357 + }, + { + "epoch": 1.5065871585078352, + "grad_norm": 0.37919580936431885, + "learning_rate": 0.0001659128289473684, + "loss": 0.4, + "step": 1358 + }, + { + "epoch": 1.507696574677576, + "grad_norm": 0.3226320743560791, + "learning_rate": 0.00016578947368421052, + "loss": 0.6081, + "step": 1359 + }, + { + "epoch": 1.5088059908473166, + "grad_norm": 0.378694623708725, + "learning_rate": 0.00016566611842105264, + "loss": 0.5211, + "step": 1360 + }, + { + "epoch": 1.5099154070170573, + "grad_norm": 0.418722003698349, + "learning_rate": 0.00016554276315789473, + "loss": 0.67, + "step": 1361 + }, + { + "epoch": 1.511024823186798, + "grad_norm": 0.3537873327732086, + "learning_rate": 0.00016541940789473682, + "loss": 0.6805, + "step": 1362 + }, + { + "epoch": 1.5121342393565387, + "grad_norm": 0.2202320247888565, + "learning_rate": 0.00016529605263157894, + "loss": 0.3873, + "step": 1363 + }, + { + "epoch": 1.5132436555262792, + "grad_norm": 0.36664825677871704, + "learning_rate": 0.00016517269736842104, + "loss": 0.4605, + "step": 1364 + }, + { + "epoch": 1.51435307169602, + "grad_norm": 0.25324442982673645, + "learning_rate": 0.00016504934210526313, + "loss": 0.6109, + "step": 1365 + }, + { + "epoch": 1.5154624878657605, + "grad_norm": 0.19155505299568176, + "learning_rate": 0.00016492598684210525, + "loss": 0.7548, + "step": 1366 + }, + { + "epoch": 1.5165719040355015, + "grad_norm": 0.3349843919277191, + "learning_rate": 0.00016480263157894734, + "loss": 0.3317, + "step": 1367 + }, + { + "epoch": 1.517681320205242, + "grad_norm": 0.26652729511260986, + "learning_rate": 0.00016467927631578944, + "loss": 0.6128, + "step": 1368 + }, + { + "epoch": 1.5187907363749826, + "grad_norm": 0.30250903964042664, + "learning_rate": 0.00016455592105263158, + "loss": 0.6961, + "step": 1369 + }, + { + "epoch": 1.5199001525447233, + "grad_norm": 0.4225883185863495, + "learning_rate": 0.00016443256578947368, + "loss": 0.5264, + "step": 1370 + }, + { + "epoch": 1.521009568714464, + "grad_norm": 0.331551730632782, + "learning_rate": 0.00016430921052631577, + "loss": 0.4245, + "step": 1371 + }, + { + "epoch": 1.5221189848842047, + "grad_norm": 0.2630516588687897, + "learning_rate": 0.0001641858552631579, + "loss": 0.4204, + "step": 1372 + }, + { + "epoch": 1.5232284010539454, + "grad_norm": 0.2230089157819748, + "learning_rate": 0.00016406249999999998, + "loss": 0.4686, + "step": 1373 + }, + { + "epoch": 1.524337817223686, + "grad_norm": 0.33370235562324524, + "learning_rate": 0.00016393914473684208, + "loss": 0.69, + "step": 1374 + }, + { + "epoch": 1.5254472333934266, + "grad_norm": 0.3383263349533081, + "learning_rate": 0.0001638157894736842, + "loss": 0.5046, + "step": 1375 + }, + { + "epoch": 1.5265566495631675, + "grad_norm": 0.3292944133281708, + "learning_rate": 0.0001636924342105263, + "loss": 0.5178, + "step": 1376 + }, + { + "epoch": 1.527666065732908, + "grad_norm": 0.2474631369113922, + "learning_rate": 0.00016356907894736838, + "loss": 0.3783, + "step": 1377 + }, + { + "epoch": 1.528775481902649, + "grad_norm": 0.306476354598999, + "learning_rate": 0.00016344572368421053, + "loss": 0.3773, + "step": 1378 + }, + { + "epoch": 1.5298848980723894, + "grad_norm": 0.2079583704471588, + "learning_rate": 0.00016332236842105262, + "loss": 0.4554, + "step": 1379 + }, + { + "epoch": 1.53099431424213, + "grad_norm": 0.4730626940727234, + "learning_rate": 0.00016319901315789474, + "loss": 0.3571, + "step": 1380 + }, + { + "epoch": 1.5321037304118708, + "grad_norm": 0.22660337388515472, + "learning_rate": 0.00016307565789473684, + "loss": 0.4086, + "step": 1381 + }, + { + "epoch": 1.5332131465816115, + "grad_norm": 0.444742888212204, + "learning_rate": 0.00016295230263157893, + "loss": 0.4406, + "step": 1382 + }, + { + "epoch": 1.5343225627513521, + "grad_norm": 0.34402474761009216, + "learning_rate": 0.00016282894736842102, + "loss": 0.6871, + "step": 1383 + }, + { + "epoch": 1.5354319789210926, + "grad_norm": 0.31494930386543274, + "learning_rate": 0.00016270559210526314, + "loss": 0.5023, + "step": 1384 + }, + { + "epoch": 1.5365413950908335, + "grad_norm": 0.24129949510097504, + "learning_rate": 0.00016258223684210524, + "loss": 0.3006, + "step": 1385 + }, + { + "epoch": 1.537650811260574, + "grad_norm": 0.37455177307128906, + "learning_rate": 0.00016245888157894733, + "loss": 0.3357, + "step": 1386 + }, + { + "epoch": 1.538760227430315, + "grad_norm": 0.22518782317638397, + "learning_rate": 0.00016233552631578945, + "loss": 0.4212, + "step": 1387 + }, + { + "epoch": 1.5398696436000554, + "grad_norm": 0.3566707372665405, + "learning_rate": 0.00016221217105263157, + "loss": 0.432, + "step": 1388 + }, + { + "epoch": 1.540979059769796, + "grad_norm": 0.3053068518638611, + "learning_rate": 0.0001620888157894737, + "loss": 0.3547, + "step": 1389 + }, + { + "epoch": 1.5420884759395368, + "grad_norm": 0.26762375235557556, + "learning_rate": 0.00016196546052631578, + "loss": 0.4258, + "step": 1390 + }, + { + "epoch": 1.5431978921092775, + "grad_norm": 0.44275879859924316, + "learning_rate": 0.00016184210526315788, + "loss": 0.5698, + "step": 1391 + }, + { + "epoch": 1.5443073082790182, + "grad_norm": 0.3341034948825836, + "learning_rate": 0.00016171875, + "loss": 0.6197, + "step": 1392 + }, + { + "epoch": 1.5454167244487589, + "grad_norm": 0.21536946296691895, + "learning_rate": 0.0001615953947368421, + "loss": 0.586, + "step": 1393 + }, + { + "epoch": 1.5465261406184996, + "grad_norm": 0.518312394618988, + "learning_rate": 0.00016147203947368418, + "loss": 0.663, + "step": 1394 + }, + { + "epoch": 1.54763555678824, + "grad_norm": 0.31936115026474, + "learning_rate": 0.0001613486842105263, + "loss": 0.5038, + "step": 1395 + }, + { + "epoch": 1.548744972957981, + "grad_norm": 0.4910357892513275, + "learning_rate": 0.0001612253289473684, + "loss": 0.5321, + "step": 1396 + }, + { + "epoch": 1.5498543891277214, + "grad_norm": 0.35702967643737793, + "learning_rate": 0.00016110197368421052, + "loss": 0.5254, + "step": 1397 + }, + { + "epoch": 1.5509638052974624, + "grad_norm": 0.46834543347358704, + "learning_rate": 0.00016097861842105264, + "loss": 0.5789, + "step": 1398 + }, + { + "epoch": 1.5520732214672028, + "grad_norm": 0.48472335934638977, + "learning_rate": 0.00016085526315789473, + "loss": 0.4901, + "step": 1399 + }, + { + "epoch": 1.5531826376369435, + "grad_norm": 0.28571802377700806, + "learning_rate": 0.00016073190789473682, + "loss": 0.458, + "step": 1400 + }, + { + "epoch": 1.5542920538066842, + "grad_norm": 0.30422210693359375, + "learning_rate": 0.00016060855263157894, + "loss": 0.4369, + "step": 1401 + }, + { + "epoch": 1.555401469976425, + "grad_norm": 0.34759703278541565, + "learning_rate": 0.00016048519736842104, + "loss": 0.5257, + "step": 1402 + }, + { + "epoch": 1.5565108861461656, + "grad_norm": 0.305867999792099, + "learning_rate": 0.00016036184210526313, + "loss": 0.5571, + "step": 1403 + }, + { + "epoch": 1.557620302315906, + "grad_norm": 0.2919771075248718, + "learning_rate": 0.00016023848684210525, + "loss": 0.6567, + "step": 1404 + }, + { + "epoch": 1.558729718485647, + "grad_norm": 0.24828073382377625, + "learning_rate": 0.00016011513157894734, + "loss": 0.409, + "step": 1405 + }, + { + "epoch": 1.5598391346553875, + "grad_norm": 0.48059597611427307, + "learning_rate": 0.00015999177631578944, + "loss": 0.5028, + "step": 1406 + }, + { + "epoch": 1.5609485508251284, + "grad_norm": 0.28389695286750793, + "learning_rate": 0.00015986842105263158, + "loss": 0.369, + "step": 1407 + }, + { + "epoch": 1.5620579669948689, + "grad_norm": 0.505401074886322, + "learning_rate": 0.00015974506578947368, + "loss": 0.6917, + "step": 1408 + }, + { + "epoch": 1.5631673831646096, + "grad_norm": 0.24662017822265625, + "learning_rate": 0.00015962171052631577, + "loss": 0.3777, + "step": 1409 + }, + { + "epoch": 1.5642767993343503, + "grad_norm": 0.38750240206718445, + "learning_rate": 0.0001594983552631579, + "loss": 0.6647, + "step": 1410 + }, + { + "epoch": 1.565386215504091, + "grad_norm": 0.41798150539398193, + "learning_rate": 0.00015937499999999998, + "loss": 0.5661, + "step": 1411 + }, + { + "epoch": 1.5664956316738317, + "grad_norm": 0.24084101617336273, + "learning_rate": 0.00015925164473684208, + "loss": 0.5066, + "step": 1412 + }, + { + "epoch": 1.5676050478435724, + "grad_norm": 0.3744387924671173, + "learning_rate": 0.0001591282894736842, + "loss": 0.4722, + "step": 1413 + }, + { + "epoch": 1.568714464013313, + "grad_norm": 0.2724044919013977, + "learning_rate": 0.0001590049342105263, + "loss": 0.5259, + "step": 1414 + }, + { + "epoch": 1.5698238801830535, + "grad_norm": 0.2745903432369232, + "learning_rate": 0.00015888157894736838, + "loss": 0.4011, + "step": 1415 + }, + { + "epoch": 1.5709332963527944, + "grad_norm": 0.20273329317569733, + "learning_rate": 0.00015875822368421053, + "loss": 0.5408, + "step": 1416 + }, + { + "epoch": 1.572042712522535, + "grad_norm": 0.38695916533470154, + "learning_rate": 0.00015863486842105262, + "loss": 0.6258, + "step": 1417 + }, + { + "epoch": 1.5731521286922758, + "grad_norm": 0.26426389813423157, + "learning_rate": 0.00015851151315789474, + "loss": 0.4185, + "step": 1418 + }, + { + "epoch": 1.5742615448620163, + "grad_norm": 0.2685663104057312, + "learning_rate": 0.00015838815789473684, + "loss": 1.0182, + "step": 1419 + }, + { + "epoch": 1.575370961031757, + "grad_norm": 0.3128146231174469, + "learning_rate": 0.00015826480263157893, + "loss": 0.4687, + "step": 1420 + }, + { + "epoch": 1.5764803772014977, + "grad_norm": 0.24564802646636963, + "learning_rate": 0.00015814144736842102, + "loss": 0.4918, + "step": 1421 + }, + { + "epoch": 1.5775897933712384, + "grad_norm": 0.3619256913661957, + "learning_rate": 0.00015801809210526314, + "loss": 0.4946, + "step": 1422 + }, + { + "epoch": 1.578699209540979, + "grad_norm": 0.4339245557785034, + "learning_rate": 0.00015789473684210524, + "loss": 0.5112, + "step": 1423 + }, + { + "epoch": 1.5798086257107198, + "grad_norm": 0.18522101640701294, + "learning_rate": 0.00015777138157894733, + "loss": 0.3891, + "step": 1424 + }, + { + "epoch": 1.5809180418804605, + "grad_norm": 0.23903268575668335, + "learning_rate": 0.00015764802631578945, + "loss": 0.3706, + "step": 1425 + }, + { + "epoch": 1.582027458050201, + "grad_norm": 0.6427960991859436, + "learning_rate": 0.00015752467105263157, + "loss": 0.6339, + "step": 1426 + }, + { + "epoch": 1.5831368742199419, + "grad_norm": 0.2820015847682953, + "learning_rate": 0.0001574013157894737, + "loss": 0.4439, + "step": 1427 + }, + { + "epoch": 1.5842462903896823, + "grad_norm": 0.26673081517219543, + "learning_rate": 0.00015727796052631579, + "loss": 0.5116, + "step": 1428 + }, + { + "epoch": 1.5853557065594233, + "grad_norm": 0.36776989698410034, + "learning_rate": 0.00015715460526315788, + "loss": 0.4931, + "step": 1429 + }, + { + "epoch": 1.5864651227291637, + "grad_norm": 0.3894679844379425, + "learning_rate": 0.00015703125, + "loss": 0.478, + "step": 1430 + }, + { + "epoch": 1.5875745388989044, + "grad_norm": 0.27240705490112305, + "learning_rate": 0.0001569078947368421, + "loss": 0.3737, + "step": 1431 + }, + { + "epoch": 1.5886839550686451, + "grad_norm": 0.32550275325775146, + "learning_rate": 0.00015678453947368418, + "loss": 0.5774, + "step": 1432 + }, + { + "epoch": 1.5897933712383858, + "grad_norm": 0.32884177565574646, + "learning_rate": 0.00015666118421052628, + "loss": 0.5408, + "step": 1433 + }, + { + "epoch": 1.5909027874081265, + "grad_norm": 0.45859280228614807, + "learning_rate": 0.0001565378289473684, + "loss": 0.7546, + "step": 1434 + }, + { + "epoch": 1.592012203577867, + "grad_norm": 0.29456886649131775, + "learning_rate": 0.00015641447368421052, + "loss": 0.3665, + "step": 1435 + }, + { + "epoch": 1.593121619747608, + "grad_norm": 0.22899344563484192, + "learning_rate": 0.00015629111842105264, + "loss": 0.4733, + "step": 1436 + }, + { + "epoch": 1.5942310359173484, + "grad_norm": 0.3687454164028168, + "learning_rate": 0.00015616776315789473, + "loss": 0.4508, + "step": 1437 + }, + { + "epoch": 1.5953404520870893, + "grad_norm": 0.2763974964618683, + "learning_rate": 0.00015604440789473683, + "loss": 0.5613, + "step": 1438 + }, + { + "epoch": 1.5964498682568298, + "grad_norm": 0.4642561376094818, + "learning_rate": 0.00015592105263157895, + "loss": 0.5261, + "step": 1439 + }, + { + "epoch": 1.5975592844265705, + "grad_norm": 0.46116307377815247, + "learning_rate": 0.00015579769736842104, + "loss": 0.5183, + "step": 1440 + }, + { + "epoch": 1.5986687005963112, + "grad_norm": 0.42349570989608765, + "learning_rate": 0.00015567434210526313, + "loss": 0.4358, + "step": 1441 + }, + { + "epoch": 1.5997781167660519, + "grad_norm": 0.5927262902259827, + "learning_rate": 0.00015555098684210525, + "loss": 0.4545, + "step": 1442 + }, + { + "epoch": 1.6008875329357926, + "grad_norm": 0.2763030230998993, + "learning_rate": 0.00015542763157894735, + "loss": 0.598, + "step": 1443 + }, + { + "epoch": 1.6019969491055333, + "grad_norm": 0.2903679311275482, + "learning_rate": 0.00015530427631578944, + "loss": 0.4782, + "step": 1444 + }, + { + "epoch": 1.603106365275274, + "grad_norm": 0.31723347306251526, + "learning_rate": 0.00015518092105263159, + "loss": 0.4793, + "step": 1445 + }, + { + "epoch": 1.6042157814450144, + "grad_norm": 0.3059498965740204, + "learning_rate": 0.00015505756578947368, + "loss": 0.5086, + "step": 1446 + }, + { + "epoch": 1.6053251976147553, + "grad_norm": 0.5435683131217957, + "learning_rate": 0.00015493421052631577, + "loss": 0.5502, + "step": 1447 + }, + { + "epoch": 1.6064346137844958, + "grad_norm": 0.28451740741729736, + "learning_rate": 0.0001548108552631579, + "loss": 0.3415, + "step": 1448 + }, + { + "epoch": 1.6075440299542367, + "grad_norm": 0.27565130591392517, + "learning_rate": 0.00015468749999999999, + "loss": 0.4961, + "step": 1449 + }, + { + "epoch": 1.6086534461239772, + "grad_norm": 0.38885173201560974, + "learning_rate": 0.00015456414473684208, + "loss": 0.6244, + "step": 1450 + }, + { + "epoch": 1.609762862293718, + "grad_norm": 0.3009326457977295, + "learning_rate": 0.0001544407894736842, + "loss": 0.3191, + "step": 1451 + }, + { + "epoch": 1.6108722784634586, + "grad_norm": 0.2374914437532425, + "learning_rate": 0.0001543174342105263, + "loss": 0.4029, + "step": 1452 + }, + { + "epoch": 1.6119816946331993, + "grad_norm": 0.24939504265785217, + "learning_rate": 0.00015419407894736839, + "loss": 0.6444, + "step": 1453 + }, + { + "epoch": 1.61309111080294, + "grad_norm": 0.2710380554199219, + "learning_rate": 0.00015407072368421053, + "loss": 0.5412, + "step": 1454 + }, + { + "epoch": 1.6142005269726805, + "grad_norm": 0.32970866560935974, + "learning_rate": 0.00015394736842105263, + "loss": 0.4512, + "step": 1455 + }, + { + "epoch": 1.6153099431424214, + "grad_norm": 0.4523448944091797, + "learning_rate": 0.00015382401315789472, + "loss": 0.5883, + "step": 1456 + }, + { + "epoch": 1.6164193593121619, + "grad_norm": 0.25891023874282837, + "learning_rate": 0.00015370065789473684, + "loss": 0.5516, + "step": 1457 + }, + { + "epoch": 1.6175287754819028, + "grad_norm": 0.37495937943458557, + "learning_rate": 0.00015357730263157893, + "loss": 0.5841, + "step": 1458 + }, + { + "epoch": 1.6186381916516432, + "grad_norm": 0.33875572681427, + "learning_rate": 0.00015345394736842103, + "loss": 0.6034, + "step": 1459 + }, + { + "epoch": 1.6197476078213842, + "grad_norm": 0.2283666431903839, + "learning_rate": 0.00015333059210526315, + "loss": 0.3142, + "step": 1460 + }, + { + "epoch": 1.6208570239911246, + "grad_norm": 0.35166046023368835, + "learning_rate": 0.00015320723684210524, + "loss": 0.5688, + "step": 1461 + }, + { + "epoch": 1.6219664401608653, + "grad_norm": 0.3472052812576294, + "learning_rate": 0.00015308388157894733, + "loss": 0.2508, + "step": 1462 + }, + { + "epoch": 1.623075856330606, + "grad_norm": 0.30597633123397827, + "learning_rate": 0.00015296052631578945, + "loss": 0.5829, + "step": 1463 + }, + { + "epoch": 1.6241852725003467, + "grad_norm": 0.7042750716209412, + "learning_rate": 0.00015283717105263157, + "loss": 0.5834, + "step": 1464 + }, + { + "epoch": 1.6252946886700874, + "grad_norm": 0.3710060119628906, + "learning_rate": 0.0001527138157894737, + "loss": 0.4427, + "step": 1465 + }, + { + "epoch": 1.626404104839828, + "grad_norm": 0.3646557927131653, + "learning_rate": 0.00015259046052631579, + "loss": 0.5418, + "step": 1466 + }, + { + "epoch": 1.6275135210095688, + "grad_norm": 0.357334703207016, + "learning_rate": 0.00015246710526315788, + "loss": 0.5551, + "step": 1467 + }, + { + "epoch": 1.6286229371793093, + "grad_norm": 0.2936727702617645, + "learning_rate": 0.00015234375, + "loss": 0.5441, + "step": 1468 + }, + { + "epoch": 1.6297323533490502, + "grad_norm": 0.24167072772979736, + "learning_rate": 0.0001522203947368421, + "loss": 0.6756, + "step": 1469 + }, + { + "epoch": 1.6308417695187907, + "grad_norm": 0.3860384523868561, + "learning_rate": 0.00015209703947368419, + "loss": 0.6466, + "step": 1470 + }, + { + "epoch": 1.6319511856885314, + "grad_norm": 0.20689241588115692, + "learning_rate": 0.00015197368421052628, + "loss": 0.28, + "step": 1471 + }, + { + "epoch": 1.633060601858272, + "grad_norm": 0.2413812130689621, + "learning_rate": 0.0001518503289473684, + "loss": 0.5198, + "step": 1472 + }, + { + "epoch": 1.6341700180280128, + "grad_norm": 0.2979881167411804, + "learning_rate": 0.00015172697368421052, + "loss": 0.5016, + "step": 1473 + }, + { + "epoch": 1.6352794341977535, + "grad_norm": 0.22364138066768646, + "learning_rate": 0.00015160361842105264, + "loss": 0.3996, + "step": 1474 + }, + { + "epoch": 1.6363888503674942, + "grad_norm": 0.27691081166267395, + "learning_rate": 0.00015148026315789473, + "loss": 0.4038, + "step": 1475 + }, + { + "epoch": 1.6374982665372348, + "grad_norm": 0.300599604845047, + "learning_rate": 0.00015135690789473683, + "loss": 0.4522, + "step": 1476 + }, + { + "epoch": 1.6386076827069753, + "grad_norm": 0.2906956672668457, + "learning_rate": 0.00015123355263157895, + "loss": 0.5978, + "step": 1477 + }, + { + "epoch": 1.6397170988767162, + "grad_norm": 0.3278878629207611, + "learning_rate": 0.00015111019736842104, + "loss": 0.445, + "step": 1478 + }, + { + "epoch": 1.6408265150464567, + "grad_norm": 0.25733959674835205, + "learning_rate": 0.00015098684210526313, + "loss": 0.4896, + "step": 1479 + }, + { + "epoch": 1.6419359312161976, + "grad_norm": 0.33139604330062866, + "learning_rate": 0.00015086348684210525, + "loss": 0.54, + "step": 1480 + }, + { + "epoch": 1.643045347385938, + "grad_norm": 0.6023613810539246, + "learning_rate": 0.00015074013157894735, + "loss": 0.6114, + "step": 1481 + }, + { + "epoch": 1.6441547635556788, + "grad_norm": 0.22838066518306732, + "learning_rate": 0.00015061677631578944, + "loss": 0.426, + "step": 1482 + }, + { + "epoch": 1.6452641797254195, + "grad_norm": 0.26457536220550537, + "learning_rate": 0.0001504934210526316, + "loss": 0.4963, + "step": 1483 + }, + { + "epoch": 1.6463735958951602, + "grad_norm": 0.30254966020584106, + "learning_rate": 0.00015037006578947368, + "loss": 0.5112, + "step": 1484 + }, + { + "epoch": 1.6474830120649009, + "grad_norm": 0.23207102715969086, + "learning_rate": 0.00015024671052631577, + "loss": 0.4926, + "step": 1485 + }, + { + "epoch": 1.6485924282346414, + "grad_norm": 0.25486233830451965, + "learning_rate": 0.0001501233552631579, + "loss": 0.4387, + "step": 1486 + }, + { + "epoch": 1.6497018444043823, + "grad_norm": 0.3489553928375244, + "learning_rate": 0.00015, + "loss": 0.6869, + "step": 1487 + }, + { + "epoch": 1.6508112605741228, + "grad_norm": 0.28267863392829895, + "learning_rate": 0.00014987664473684208, + "loss": 0.4191, + "step": 1488 + }, + { + "epoch": 1.6519206767438637, + "grad_norm": 0.2585887014865875, + "learning_rate": 0.0001497532894736842, + "loss": 0.5599, + "step": 1489 + }, + { + "epoch": 1.6530300929136041, + "grad_norm": 0.27951672673225403, + "learning_rate": 0.00014962993421052632, + "loss": 0.6091, + "step": 1490 + }, + { + "epoch": 1.6541395090833448, + "grad_norm": 0.33680734038352966, + "learning_rate": 0.00014950657894736841, + "loss": 0.5986, + "step": 1491 + }, + { + "epoch": 1.6552489252530855, + "grad_norm": 0.33777835965156555, + "learning_rate": 0.0001493832236842105, + "loss": 0.6051, + "step": 1492 + }, + { + "epoch": 1.6563583414228262, + "grad_norm": 0.29178673028945923, + "learning_rate": 0.00014925986842105263, + "loss": 0.3462, + "step": 1493 + }, + { + "epoch": 1.657467757592567, + "grad_norm": 0.23857684433460236, + "learning_rate": 0.00014913651315789472, + "loss": 0.3521, + "step": 1494 + }, + { + "epoch": 1.6585771737623076, + "grad_norm": 0.33080053329467773, + "learning_rate": 0.00014901315789473684, + "loss": 0.4185, + "step": 1495 + }, + { + "epoch": 1.6596865899320483, + "grad_norm": 0.35032498836517334, + "learning_rate": 0.00014888980263157893, + "loss": 0.6305, + "step": 1496 + }, + { + "epoch": 1.6607960061017888, + "grad_norm": 0.2609749138355255, + "learning_rate": 0.00014876644736842103, + "loss": 0.4313, + "step": 1497 + }, + { + "epoch": 1.6619054222715297, + "grad_norm": 0.3178192675113678, + "learning_rate": 0.00014864309210526315, + "loss": 0.6381, + "step": 1498 + }, + { + "epoch": 1.6630148384412702, + "grad_norm": 0.3444008231163025, + "learning_rate": 0.00014851973684210527, + "loss": 0.6142, + "step": 1499 + }, + { + "epoch": 1.664124254611011, + "grad_norm": 0.24173513054847717, + "learning_rate": 0.00014839638157894736, + "loss": 0.5568, + "step": 1500 + }, + { + "epoch": 1.6652336707807516, + "grad_norm": 0.23622675240039825, + "learning_rate": 0.00014827302631578945, + "loss": 0.4888, + "step": 1501 + }, + { + "epoch": 1.6663430869504923, + "grad_norm": 0.24951039254665375, + "learning_rate": 0.00014814967105263157, + "loss": 0.5554, + "step": 1502 + }, + { + "epoch": 1.667452503120233, + "grad_norm": 0.20648309588432312, + "learning_rate": 0.00014802631578947367, + "loss": 0.3302, + "step": 1503 + }, + { + "epoch": 1.6685619192899737, + "grad_norm": 0.2786915898323059, + "learning_rate": 0.0001479029605263158, + "loss": 0.3773, + "step": 1504 + }, + { + "epoch": 1.6696713354597144, + "grad_norm": 0.35018453001976013, + "learning_rate": 0.00014777960526315788, + "loss": 0.5903, + "step": 1505 + }, + { + "epoch": 1.670780751629455, + "grad_norm": 0.34121614694595337, + "learning_rate": 0.00014765624999999997, + "loss": 0.4813, + "step": 1506 + }, + { + "epoch": 1.6718901677991957, + "grad_norm": 0.41687941551208496, + "learning_rate": 0.0001475328947368421, + "loss": 0.7389, + "step": 1507 + }, + { + "epoch": 1.6729995839689362, + "grad_norm": 0.27919813990592957, + "learning_rate": 0.0001474095394736842, + "loss": 0.3984, + "step": 1508 + }, + { + "epoch": 1.6741090001386771, + "grad_norm": 0.2847552001476288, + "learning_rate": 0.0001472861842105263, + "loss": 0.3091, + "step": 1509 + }, + { + "epoch": 1.6752184163084176, + "grad_norm": 0.3107469379901886, + "learning_rate": 0.0001471628289473684, + "loss": 0.4399, + "step": 1510 + }, + { + "epoch": 1.6763278324781585, + "grad_norm": 0.21151646971702576, + "learning_rate": 0.00014703947368421052, + "loss": 0.3691, + "step": 1511 + }, + { + "epoch": 1.677437248647899, + "grad_norm": 0.27195483446121216, + "learning_rate": 0.00014691611842105261, + "loss": 0.5531, + "step": 1512 + }, + { + "epoch": 1.6785466648176397, + "grad_norm": 0.2964016795158386, + "learning_rate": 0.0001467927631578947, + "loss": 0.4978, + "step": 1513 + }, + { + "epoch": 1.6796560809873804, + "grad_norm": 0.39583608508110046, + "learning_rate": 0.00014666940789473683, + "loss": 0.6652, + "step": 1514 + }, + { + "epoch": 1.680765497157121, + "grad_norm": 0.23227983713150024, + "learning_rate": 0.00014654605263157895, + "loss": 0.5401, + "step": 1515 + }, + { + "epoch": 1.6818749133268618, + "grad_norm": 0.2583910822868347, + "learning_rate": 0.00014642269736842104, + "loss": 0.5246, + "step": 1516 + }, + { + "epoch": 1.6829843294966023, + "grad_norm": 0.32901066541671753, + "learning_rate": 0.00014629934210526313, + "loss": 0.4348, + "step": 1517 + }, + { + "epoch": 1.6840937456663432, + "grad_norm": 0.2796122133731842, + "learning_rate": 0.00014617598684210525, + "loss": 0.2749, + "step": 1518 + }, + { + "epoch": 1.6852031618360837, + "grad_norm": 0.2737559378147125, + "learning_rate": 0.00014605263157894735, + "loss": 0.3413, + "step": 1519 + }, + { + "epoch": 1.6863125780058246, + "grad_norm": 0.5847452282905579, + "learning_rate": 0.00014592927631578947, + "loss": 0.5046, + "step": 1520 + }, + { + "epoch": 1.687421994175565, + "grad_norm": 0.42608511447906494, + "learning_rate": 0.00014580592105263156, + "loss": 0.511, + "step": 1521 + }, + { + "epoch": 1.6885314103453057, + "grad_norm": 0.3839153051376343, + "learning_rate": 0.00014568256578947365, + "loss": 0.4812, + "step": 1522 + }, + { + "epoch": 1.6896408265150464, + "grad_norm": 0.2144833356142044, + "learning_rate": 0.00014555921052631577, + "loss": 0.479, + "step": 1523 + }, + { + "epoch": 1.6907502426847871, + "grad_norm": 0.34472957253456116, + "learning_rate": 0.0001454358552631579, + "loss": 0.356, + "step": 1524 + }, + { + "epoch": 1.6918596588545278, + "grad_norm": 0.29683181643486023, + "learning_rate": 0.0001453125, + "loss": 0.385, + "step": 1525 + }, + { + "epoch": 1.6929690750242685, + "grad_norm": 0.3433467447757721, + "learning_rate": 0.00014518914473684208, + "loss": 0.4606, + "step": 1526 + }, + { + "epoch": 1.6940784911940092, + "grad_norm": 0.3034195601940155, + "learning_rate": 0.0001450657894736842, + "loss": 0.5594, + "step": 1527 + }, + { + "epoch": 1.6951879073637497, + "grad_norm": 0.49384185671806335, + "learning_rate": 0.00014494243421052632, + "loss": 0.6995, + "step": 1528 + }, + { + "epoch": 1.6962973235334906, + "grad_norm": 0.5246427655220032, + "learning_rate": 0.00014481907894736842, + "loss": 0.6432, + "step": 1529 + }, + { + "epoch": 1.697406739703231, + "grad_norm": 0.26392480731010437, + "learning_rate": 0.0001446957236842105, + "loss": 0.5626, + "step": 1530 + }, + { + "epoch": 1.698516155872972, + "grad_norm": 0.27015626430511475, + "learning_rate": 0.0001445723684210526, + "loss": 0.4244, + "step": 1531 + }, + { + "epoch": 1.6996255720427125, + "grad_norm": 0.3427369296550751, + "learning_rate": 0.00014444901315789472, + "loss": 0.4015, + "step": 1532 + }, + { + "epoch": 1.7007349882124532, + "grad_norm": 0.4389760196208954, + "learning_rate": 0.00014432565789473684, + "loss": 0.654, + "step": 1533 + }, + { + "epoch": 1.7018444043821939, + "grad_norm": 0.36939921975135803, + "learning_rate": 0.00014420230263157894, + "loss": 0.6009, + "step": 1534 + }, + { + "epoch": 1.7029538205519346, + "grad_norm": 0.2916509211063385, + "learning_rate": 0.00014407894736842103, + "loss": 0.4624, + "step": 1535 + }, + { + "epoch": 1.7040632367216753, + "grad_norm": 0.5189476013183594, + "learning_rate": 0.00014395559210526315, + "loss": 0.5838, + "step": 1536 + }, + { + "epoch": 1.7051726528914157, + "grad_norm": 0.2686052620410919, + "learning_rate": 0.00014383223684210527, + "loss": 0.9601, + "step": 1537 + }, + { + "epoch": 1.7062820690611566, + "grad_norm": 0.28845494985580444, + "learning_rate": 0.00014370888157894736, + "loss": 0.3672, + "step": 1538 + }, + { + "epoch": 1.7073914852308971, + "grad_norm": 0.21178792417049408, + "learning_rate": 0.00014358552631578946, + "loss": 0.3064, + "step": 1539 + }, + { + "epoch": 1.708500901400638, + "grad_norm": 0.2538648843765259, + "learning_rate": 0.00014346217105263158, + "loss": 0.4975, + "step": 1540 + }, + { + "epoch": 1.7096103175703785, + "grad_norm": 0.4006761908531189, + "learning_rate": 0.00014333881578947367, + "loss": 0.5619, + "step": 1541 + }, + { + "epoch": 1.7107197337401192, + "grad_norm": 0.3077350854873657, + "learning_rate": 0.0001432154605263158, + "loss": 0.6509, + "step": 1542 + }, + { + "epoch": 1.71182914990986, + "grad_norm": 0.29142218828201294, + "learning_rate": 0.00014309210526315788, + "loss": 0.4765, + "step": 1543 + }, + { + "epoch": 1.7129385660796006, + "grad_norm": 0.3905639350414276, + "learning_rate": 0.00014296874999999998, + "loss": 0.8906, + "step": 1544 + }, + { + "epoch": 1.7140479822493413, + "grad_norm": 0.32861366868019104, + "learning_rate": 0.0001428453947368421, + "loss": 0.6136, + "step": 1545 + }, + { + "epoch": 1.715157398419082, + "grad_norm": 0.20155826210975647, + "learning_rate": 0.0001427220394736842, + "loss": 0.4392, + "step": 1546 + }, + { + "epoch": 1.7162668145888227, + "grad_norm": 0.35804450511932373, + "learning_rate": 0.0001425986842105263, + "loss": 0.6582, + "step": 1547 + }, + { + "epoch": 1.7173762307585632, + "grad_norm": 0.26054689288139343, + "learning_rate": 0.0001424753289473684, + "loss": 0.4493, + "step": 1548 + }, + { + "epoch": 1.718485646928304, + "grad_norm": 0.25189530849456787, + "learning_rate": 0.00014235197368421052, + "loss": 0.4806, + "step": 1549 + }, + { + "epoch": 1.7195950630980446, + "grad_norm": 0.3394787609577179, + "learning_rate": 0.00014222861842105262, + "loss": 0.5681, + "step": 1550 + }, + { + "epoch": 1.7207044792677855, + "grad_norm": 0.3084029257297516, + "learning_rate": 0.0001421052631578947, + "loss": 0.6309, + "step": 1551 + }, + { + "epoch": 1.721813895437526, + "grad_norm": 0.3268156945705414, + "learning_rate": 0.00014198190789473683, + "loss": 0.4937, + "step": 1552 + }, + { + "epoch": 1.7229233116072666, + "grad_norm": 0.2543306350708008, + "learning_rate": 0.00014185855263157895, + "loss": 0.3895, + "step": 1553 + }, + { + "epoch": 1.7240327277770073, + "grad_norm": 0.2428501844406128, + "learning_rate": 0.00014173519736842104, + "loss": 0.4724, + "step": 1554 + }, + { + "epoch": 1.725142143946748, + "grad_norm": 0.34834590554237366, + "learning_rate": 0.00014161184210526314, + "loss": 0.4686, + "step": 1555 + }, + { + "epoch": 1.7262515601164887, + "grad_norm": 0.2403583824634552, + "learning_rate": 0.00014148848684210526, + "loss": 0.3847, + "step": 1556 + }, + { + "epoch": 1.7273609762862294, + "grad_norm": 0.38176214694976807, + "learning_rate": 0.00014136513157894735, + "loss": 0.4869, + "step": 1557 + }, + { + "epoch": 1.7284703924559701, + "grad_norm": 0.2659490406513214, + "learning_rate": 0.00014124177631578947, + "loss": 0.4449, + "step": 1558 + }, + { + "epoch": 1.7295798086257106, + "grad_norm": 0.2423594892024994, + "learning_rate": 0.00014111842105263156, + "loss": 0.5206, + "step": 1559 + }, + { + "epoch": 1.7306892247954515, + "grad_norm": 0.3293440639972687, + "learning_rate": 0.00014099506578947366, + "loss": 0.621, + "step": 1560 + }, + { + "epoch": 1.731798640965192, + "grad_norm": 0.27292686700820923, + "learning_rate": 0.00014087171052631578, + "loss": 0.4955, + "step": 1561 + }, + { + "epoch": 1.732908057134933, + "grad_norm": 0.3719004690647125, + "learning_rate": 0.0001407483552631579, + "loss": 0.4081, + "step": 1562 + }, + { + "epoch": 1.7340174733046734, + "grad_norm": 0.3784489631652832, + "learning_rate": 0.000140625, + "loss": 0.4212, + "step": 1563 + }, + { + "epoch": 1.735126889474414, + "grad_norm": 0.24494099617004395, + "learning_rate": 0.00014050164473684208, + "loss": 0.4698, + "step": 1564 + }, + { + "epoch": 1.7362363056441548, + "grad_norm": 0.2339191883802414, + "learning_rate": 0.0001403782894736842, + "loss": 0.3486, + "step": 1565 + }, + { + "epoch": 1.7373457218138955, + "grad_norm": 0.321445107460022, + "learning_rate": 0.00014025493421052632, + "loss": 0.8323, + "step": 1566 + }, + { + "epoch": 1.7384551379836362, + "grad_norm": 0.3625154495239258, + "learning_rate": 0.00014013157894736842, + "loss": 0.3676, + "step": 1567 + }, + { + "epoch": 1.7395645541533766, + "grad_norm": 0.30214935541152954, + "learning_rate": 0.0001400082236842105, + "loss": 0.611, + "step": 1568 + }, + { + "epoch": 1.7406739703231175, + "grad_norm": 0.3197210133075714, + "learning_rate": 0.0001398848684210526, + "loss": 0.5491, + "step": 1569 + }, + { + "epoch": 1.741783386492858, + "grad_norm": 0.31939029693603516, + "learning_rate": 0.00013976151315789472, + "loss": 0.5426, + "step": 1570 + }, + { + "epoch": 1.742892802662599, + "grad_norm": 0.470907986164093, + "learning_rate": 0.00013963815789473684, + "loss": 0.6622, + "step": 1571 + }, + { + "epoch": 1.7440022188323394, + "grad_norm": 0.2162821739912033, + "learning_rate": 0.00013951480263157894, + "loss": 0.5728, + "step": 1572 + }, + { + "epoch": 1.74511163500208, + "grad_norm": 0.24964164197444916, + "learning_rate": 0.00013939144736842103, + "loss": 0.6929, + "step": 1573 + }, + { + "epoch": 1.7462210511718208, + "grad_norm": 0.34951767325401306, + "learning_rate": 0.00013926809210526315, + "loss": 0.3177, + "step": 1574 + }, + { + "epoch": 1.7473304673415615, + "grad_norm": 0.2987998425960541, + "learning_rate": 0.00013914473684210527, + "loss": 0.4462, + "step": 1575 + }, + { + "epoch": 1.7484398835113022, + "grad_norm": 0.362047016620636, + "learning_rate": 0.00013902138157894736, + "loss": 0.4171, + "step": 1576 + }, + { + "epoch": 1.749549299681043, + "grad_norm": 0.2592370808124542, + "learning_rate": 0.00013889802631578946, + "loss": 0.2222, + "step": 1577 + }, + { + "epoch": 1.7506587158507836, + "grad_norm": 0.26247555017471313, + "learning_rate": 0.00013877467105263158, + "loss": 0.5498, + "step": 1578 + }, + { + "epoch": 1.751768132020524, + "grad_norm": 0.28997063636779785, + "learning_rate": 0.00013865131578947367, + "loss": 0.537, + "step": 1579 + }, + { + "epoch": 1.752877548190265, + "grad_norm": 0.31275662779808044, + "learning_rate": 0.0001385279605263158, + "loss": 0.5331, + "step": 1580 + }, + { + "epoch": 1.7539869643600055, + "grad_norm": 0.3327484130859375, + "learning_rate": 0.00013840460526315788, + "loss": 0.4344, + "step": 1581 + }, + { + "epoch": 1.7550963805297464, + "grad_norm": 0.31380587816238403, + "learning_rate": 0.00013828124999999998, + "loss": 0.641, + "step": 1582 + }, + { + "epoch": 1.7562057966994868, + "grad_norm": 0.2786813974380493, + "learning_rate": 0.0001381578947368421, + "loss": 0.3636, + "step": 1583 + }, + { + "epoch": 1.7573152128692275, + "grad_norm": 0.33598342537879944, + "learning_rate": 0.0001380345394736842, + "loss": 0.5949, + "step": 1584 + }, + { + "epoch": 1.7584246290389682, + "grad_norm": 0.28291746973991394, + "learning_rate": 0.0001379111842105263, + "loss": 0.3358, + "step": 1585 + }, + { + "epoch": 1.759534045208709, + "grad_norm": 0.22912530601024628, + "learning_rate": 0.0001377878289473684, + "loss": 0.4407, + "step": 1586 + }, + { + "epoch": 1.7606434613784496, + "grad_norm": 0.3488161861896515, + "learning_rate": 0.00013766447368421052, + "loss": 0.4591, + "step": 1587 + }, + { + "epoch": 1.76175287754819, + "grad_norm": 0.31319087743759155, + "learning_rate": 0.00013754111842105262, + "loss": 0.496, + "step": 1588 + }, + { + "epoch": 1.762862293717931, + "grad_norm": 0.31536537408828735, + "learning_rate": 0.0001374177631578947, + "loss": 0.4566, + "step": 1589 + }, + { + "epoch": 1.7639717098876715, + "grad_norm": 0.38071408867836, + "learning_rate": 0.00013729440789473683, + "loss": 0.4728, + "step": 1590 + }, + { + "epoch": 1.7650811260574124, + "grad_norm": 0.25146248936653137, + "learning_rate": 0.00013717105263157895, + "loss": 0.3651, + "step": 1591 + }, + { + "epoch": 1.7661905422271529, + "grad_norm": 0.262510746717453, + "learning_rate": 0.00013704769736842104, + "loss": 0.3982, + "step": 1592 + }, + { + "epoch": 1.7672999583968938, + "grad_norm": 0.2857152819633484, + "learning_rate": 0.00013692434210526314, + "loss": 0.3916, + "step": 1593 + }, + { + "epoch": 1.7684093745666343, + "grad_norm": 0.5477368831634521, + "learning_rate": 0.00013680098684210526, + "loss": 0.5775, + "step": 1594 + }, + { + "epoch": 1.769518790736375, + "grad_norm": 0.30496469140052795, + "learning_rate": 0.00013667763157894735, + "loss": 0.6863, + "step": 1595 + }, + { + "epoch": 1.7706282069061157, + "grad_norm": 0.32225456833839417, + "learning_rate": 0.00013655427631578947, + "loss": 0.4028, + "step": 1596 + }, + { + "epoch": 1.7717376230758564, + "grad_norm": 0.25836458802223206, + "learning_rate": 0.00013643092105263156, + "loss": 0.4638, + "step": 1597 + }, + { + "epoch": 1.772847039245597, + "grad_norm": 0.24989454448223114, + "learning_rate": 0.00013630756578947366, + "loss": 0.5425, + "step": 1598 + }, + { + "epoch": 1.7739564554153375, + "grad_norm": 0.46502557396888733, + "learning_rate": 0.00013618421052631578, + "loss": 0.5317, + "step": 1599 + }, + { + "epoch": 1.7750658715850784, + "grad_norm": 0.32870060205459595, + "learning_rate": 0.0001360608552631579, + "loss": 0.5497, + "step": 1600 + }, + { + "epoch": 1.776175287754819, + "grad_norm": 0.24722667038440704, + "learning_rate": 0.0001359375, + "loss": 0.5929, + "step": 1601 + }, + { + "epoch": 1.7772847039245598, + "grad_norm": 0.3317899703979492, + "learning_rate": 0.00013581414473684208, + "loss": 0.4435, + "step": 1602 + }, + { + "epoch": 1.7783941200943003, + "grad_norm": 0.36830225586891174, + "learning_rate": 0.0001356907894736842, + "loss": 0.5876, + "step": 1603 + }, + { + "epoch": 1.779503536264041, + "grad_norm": 0.23982636630535126, + "learning_rate": 0.0001355674342105263, + "loss": 0.4906, + "step": 1604 + }, + { + "epoch": 1.7806129524337817, + "grad_norm": 0.3808034062385559, + "learning_rate": 0.00013544407894736842, + "loss": 0.6202, + "step": 1605 + }, + { + "epoch": 1.7817223686035224, + "grad_norm": 0.3208853006362915, + "learning_rate": 0.0001353207236842105, + "loss": 0.4002, + "step": 1606 + }, + { + "epoch": 1.782831784773263, + "grad_norm": 0.42497456073760986, + "learning_rate": 0.0001351973684210526, + "loss": 0.6453, + "step": 1607 + }, + { + "epoch": 1.7839412009430038, + "grad_norm": 0.5558460354804993, + "learning_rate": 0.00013507401315789472, + "loss": 0.424, + "step": 1608 + }, + { + "epoch": 1.7850506171127445, + "grad_norm": 0.38764357566833496, + "learning_rate": 0.00013495065789473684, + "loss": 0.4115, + "step": 1609 + }, + { + "epoch": 1.786160033282485, + "grad_norm": 0.3829018771648407, + "learning_rate": 0.00013482730263157894, + "loss": 0.4814, + "step": 1610 + }, + { + "epoch": 1.7872694494522259, + "grad_norm": 0.4661031663417816, + "learning_rate": 0.00013470394736842103, + "loss": 0.6416, + "step": 1611 + }, + { + "epoch": 1.7883788656219664, + "grad_norm": 0.2798513174057007, + "learning_rate": 0.00013458059210526315, + "loss": 0.4603, + "step": 1612 + }, + { + "epoch": 1.7894882817917073, + "grad_norm": 0.3700726330280304, + "learning_rate": 0.00013445723684210527, + "loss": 0.4821, + "step": 1613 + }, + { + "epoch": 1.7905976979614477, + "grad_norm": 0.35398468375205994, + "learning_rate": 0.00013433388157894736, + "loss": 0.5852, + "step": 1614 + }, + { + "epoch": 1.7917071141311884, + "grad_norm": 0.37176424264907837, + "learning_rate": 0.00013421052631578946, + "loss": 0.605, + "step": 1615 + }, + { + "epoch": 1.7928165303009291, + "grad_norm": 0.2966163754463196, + "learning_rate": 0.00013408717105263158, + "loss": 0.4163, + "step": 1616 + }, + { + "epoch": 1.7939259464706698, + "grad_norm": 0.3742397129535675, + "learning_rate": 0.00013396381578947367, + "loss": 0.4507, + "step": 1617 + }, + { + "epoch": 1.7950353626404105, + "grad_norm": 0.36498209834098816, + "learning_rate": 0.0001338404605263158, + "loss": 0.8315, + "step": 1618 + }, + { + "epoch": 1.796144778810151, + "grad_norm": 0.4069786071777344, + "learning_rate": 0.00013371710526315788, + "loss": 0.5758, + "step": 1619 + }, + { + "epoch": 1.797254194979892, + "grad_norm": 0.30974453687667847, + "learning_rate": 0.00013359374999999998, + "loss": 0.7786, + "step": 1620 + }, + { + "epoch": 1.7983636111496324, + "grad_norm": 0.5354030728340149, + "learning_rate": 0.0001334703947368421, + "loss": 0.5513, + "step": 1621 + }, + { + "epoch": 1.7994730273193733, + "grad_norm": 0.24419055879116058, + "learning_rate": 0.0001333470394736842, + "loss": 0.4483, + "step": 1622 + }, + { + "epoch": 1.8005824434891138, + "grad_norm": 0.33314335346221924, + "learning_rate": 0.0001332236842105263, + "loss": 0.5992, + "step": 1623 + }, + { + "epoch": 1.8016918596588545, + "grad_norm": 0.22179794311523438, + "learning_rate": 0.0001331003289473684, + "loss": 0.382, + "step": 1624 + }, + { + "epoch": 1.8028012758285952, + "grad_norm": 0.26683637499809265, + "learning_rate": 0.00013297697368421052, + "loss": 0.5549, + "step": 1625 + }, + { + "epoch": 1.8039106919983359, + "grad_norm": 0.2577199339866638, + "learning_rate": 0.00013285361842105262, + "loss": 0.5963, + "step": 1626 + }, + { + "epoch": 1.8050201081680766, + "grad_norm": 0.30272090435028076, + "learning_rate": 0.0001327302631578947, + "loss": 0.4767, + "step": 1627 + }, + { + "epoch": 1.8061295243378173, + "grad_norm": 0.4484618306159973, + "learning_rate": 0.00013260690789473683, + "loss": 0.4336, + "step": 1628 + }, + { + "epoch": 1.807238940507558, + "grad_norm": 0.3869413733482361, + "learning_rate": 0.00013248355263157892, + "loss": 0.3799, + "step": 1629 + }, + { + "epoch": 1.8083483566772984, + "grad_norm": 0.26756906509399414, + "learning_rate": 0.00013236019736842104, + "loss": 0.5128, + "step": 1630 + }, + { + "epoch": 1.8094577728470393, + "grad_norm": 0.21581970155239105, + "learning_rate": 0.00013223684210526314, + "loss": 0.5776, + "step": 1631 + }, + { + "epoch": 1.8105671890167798, + "grad_norm": 0.5925078392028809, + "learning_rate": 0.00013211348684210526, + "loss": 0.4828, + "step": 1632 + }, + { + "epoch": 1.8116766051865207, + "grad_norm": 0.29944899678230286, + "learning_rate": 0.00013199013157894735, + "loss": 0.5219, + "step": 1633 + }, + { + "epoch": 1.8127860213562612, + "grad_norm": 0.3090478479862213, + "learning_rate": 0.00013186677631578947, + "loss": 0.7766, + "step": 1634 + }, + { + "epoch": 1.813895437526002, + "grad_norm": 0.27250027656555176, + "learning_rate": 0.00013174342105263156, + "loss": 0.5283, + "step": 1635 + }, + { + "epoch": 1.8150048536957426, + "grad_norm": 0.30475983023643494, + "learning_rate": 0.00013162006578947366, + "loss": 0.4438, + "step": 1636 + }, + { + "epoch": 1.8161142698654833, + "grad_norm": 0.4503616690635681, + "learning_rate": 0.00013149671052631578, + "loss": 0.6286, + "step": 1637 + }, + { + "epoch": 1.817223686035224, + "grad_norm": 0.3719213306903839, + "learning_rate": 0.0001313733552631579, + "loss": 0.4938, + "step": 1638 + }, + { + "epoch": 1.8183331022049647, + "grad_norm": 0.2590722143650055, + "learning_rate": 0.00013125, + "loss": 0.3978, + "step": 1639 + }, + { + "epoch": 1.8194425183747054, + "grad_norm": 0.4789052903652191, + "learning_rate": 0.00013112664473684209, + "loss": 0.4542, + "step": 1640 + }, + { + "epoch": 1.8205519345444459, + "grad_norm": 0.3678234815597534, + "learning_rate": 0.0001310032894736842, + "loss": 0.4437, + "step": 1641 + }, + { + "epoch": 1.8216613507141868, + "grad_norm": 0.21813832223415375, + "learning_rate": 0.0001308799342105263, + "loss": 0.3382, + "step": 1642 + }, + { + "epoch": 1.8227707668839273, + "grad_norm": 0.2665456235408783, + "learning_rate": 0.00013075657894736842, + "loss": 0.3803, + "step": 1643 + }, + { + "epoch": 1.8238801830536682, + "grad_norm": 0.26693305373191833, + "learning_rate": 0.0001306332236842105, + "loss": 0.4494, + "step": 1644 + }, + { + "epoch": 1.8249895992234086, + "grad_norm": 0.22977307438850403, + "learning_rate": 0.0001305098684210526, + "loss": 0.3472, + "step": 1645 + }, + { + "epoch": 1.8260990153931493, + "grad_norm": 0.38384175300598145, + "learning_rate": 0.00013038651315789473, + "loss": 0.7401, + "step": 1646 + }, + { + "epoch": 1.82720843156289, + "grad_norm": 0.26645827293395996, + "learning_rate": 0.00013026315789473685, + "loss": 0.5021, + "step": 1647 + }, + { + "epoch": 1.8283178477326307, + "grad_norm": 0.28554990887641907, + "learning_rate": 0.00013013980263157894, + "loss": 0.3429, + "step": 1648 + }, + { + "epoch": 1.8294272639023714, + "grad_norm": 0.23377835750579834, + "learning_rate": 0.00013001644736842103, + "loss": 0.4639, + "step": 1649 + }, + { + "epoch": 1.830536680072112, + "grad_norm": 0.323998361825943, + "learning_rate": 0.00012989309210526315, + "loss": 0.5517, + "step": 1650 + }, + { + "epoch": 1.8316460962418528, + "grad_norm": 0.24397112429141998, + "learning_rate": 0.00012976973684210527, + "loss": 0.4443, + "step": 1651 + }, + { + "epoch": 1.8327555124115933, + "grad_norm": 0.23545107245445251, + "learning_rate": 0.00012964638157894737, + "loss": 0.6529, + "step": 1652 + }, + { + "epoch": 1.8338649285813342, + "grad_norm": 0.3781031668186188, + "learning_rate": 0.00012952302631578946, + "loss": 0.3811, + "step": 1653 + }, + { + "epoch": 1.8349743447510747, + "grad_norm": 0.3218782842159271, + "learning_rate": 0.00012939967105263155, + "loss": 0.5818, + "step": 1654 + }, + { + "epoch": 1.8360837609208154, + "grad_norm": 0.31816890835762024, + "learning_rate": 0.00012927631578947367, + "loss": 0.515, + "step": 1655 + }, + { + "epoch": 1.837193177090556, + "grad_norm": 0.3660028278827667, + "learning_rate": 0.0001291529605263158, + "loss": 0.6154, + "step": 1656 + }, + { + "epoch": 1.8383025932602968, + "grad_norm": 0.25920218229293823, + "learning_rate": 0.00012902960526315789, + "loss": 0.5614, + "step": 1657 + }, + { + "epoch": 1.8394120094300375, + "grad_norm": 0.3921451270580292, + "learning_rate": 0.00012890624999999998, + "loss": 0.5109, + "step": 1658 + }, + { + "epoch": 1.8405214255997782, + "grad_norm": 0.30347323417663574, + "learning_rate": 0.0001287828947368421, + "loss": 0.5139, + "step": 1659 + }, + { + "epoch": 1.8416308417695189, + "grad_norm": 0.36900901794433594, + "learning_rate": 0.0001286595394736842, + "loss": 0.3862, + "step": 1660 + }, + { + "epoch": 1.8427402579392593, + "grad_norm": 0.23627950251102448, + "learning_rate": 0.0001285361842105263, + "loss": 0.3962, + "step": 1661 + }, + { + "epoch": 1.8438496741090002, + "grad_norm": 0.3626163601875305, + "learning_rate": 0.0001284128289473684, + "loss": 0.5779, + "step": 1662 + }, + { + "epoch": 1.8449590902787407, + "grad_norm": 0.3031785488128662, + "learning_rate": 0.00012828947368421053, + "loss": 0.4422, + "step": 1663 + }, + { + "epoch": 1.8460685064484816, + "grad_norm": 0.27116191387176514, + "learning_rate": 0.00012816611842105262, + "loss": 0.4825, + "step": 1664 + }, + { + "epoch": 1.8471779226182221, + "grad_norm": 0.23863159120082855, + "learning_rate": 0.0001280427631578947, + "loss": 0.6105, + "step": 1665 + }, + { + "epoch": 1.8482873387879628, + "grad_norm": 0.3026638329029083, + "learning_rate": 0.00012791940789473683, + "loss": 0.5599, + "step": 1666 + }, + { + "epoch": 1.8493967549577035, + "grad_norm": 0.2904566526412964, + "learning_rate": 0.00012779605263157893, + "loss": 0.4156, + "step": 1667 + }, + { + "epoch": 1.8505061711274442, + "grad_norm": 0.2892657518386841, + "learning_rate": 0.00012767269736842105, + "loss": 0.3779, + "step": 1668 + }, + { + "epoch": 1.851615587297185, + "grad_norm": 0.27468252182006836, + "learning_rate": 0.00012754934210526314, + "loss": 0.4811, + "step": 1669 + }, + { + "epoch": 1.8527250034669254, + "grad_norm": 0.33178287744522095, + "learning_rate": 0.00012742598684210526, + "loss": 0.6331, + "step": 1670 + }, + { + "epoch": 1.8538344196366663, + "grad_norm": 0.3048073351383209, + "learning_rate": 0.00012730263157894735, + "loss": 0.4763, + "step": 1671 + }, + { + "epoch": 1.8549438358064068, + "grad_norm": 0.2505081593990326, + "learning_rate": 0.00012717927631578947, + "loss": 0.6395, + "step": 1672 + }, + { + "epoch": 1.8560532519761477, + "grad_norm": 0.3426123261451721, + "learning_rate": 0.00012705592105263157, + "loss": 0.4118, + "step": 1673 + }, + { + "epoch": 1.8571626681458882, + "grad_norm": 0.2770869731903076, + "learning_rate": 0.00012693256578947366, + "loss": 0.4372, + "step": 1674 + }, + { + "epoch": 1.8582720843156288, + "grad_norm": 0.28371554613113403, + "learning_rate": 0.00012680921052631578, + "loss": 0.5352, + "step": 1675 + }, + { + "epoch": 1.8593815004853695, + "grad_norm": 0.424926221370697, + "learning_rate": 0.0001266858552631579, + "loss": 0.5825, + "step": 1676 + }, + { + "epoch": 1.8604909166551102, + "grad_norm": 0.42135924100875854, + "learning_rate": 0.0001265625, + "loss": 0.54, + "step": 1677 + }, + { + "epoch": 1.861600332824851, + "grad_norm": 0.35227394104003906, + "learning_rate": 0.00012643914473684209, + "loss": 0.4149, + "step": 1678 + }, + { + "epoch": 1.8627097489945916, + "grad_norm": 0.368327796459198, + "learning_rate": 0.00012631578947368418, + "loss": 0.5194, + "step": 1679 + }, + { + "epoch": 1.8638191651643323, + "grad_norm": 0.31259453296661377, + "learning_rate": 0.0001261924342105263, + "loss": 0.5897, + "step": 1680 + }, + { + "epoch": 1.8649285813340728, + "grad_norm": 0.42234233021736145, + "learning_rate": 0.00012606907894736842, + "loss": 0.3769, + "step": 1681 + }, + { + "epoch": 1.8660379975038137, + "grad_norm": 0.258651465177536, + "learning_rate": 0.0001259457236842105, + "loss": 0.443, + "step": 1682 + }, + { + "epoch": 1.8671474136735542, + "grad_norm": 0.3242909014225006, + "learning_rate": 0.0001258223684210526, + "loss": 0.5423, + "step": 1683 + }, + { + "epoch": 1.868256829843295, + "grad_norm": 0.3746740221977234, + "learning_rate": 0.00012569901315789473, + "loss": 0.4365, + "step": 1684 + }, + { + "epoch": 1.8693662460130356, + "grad_norm": 0.2767789363861084, + "learning_rate": 0.00012557565789473685, + "loss": 0.5011, + "step": 1685 + }, + { + "epoch": 1.8704756621827763, + "grad_norm": 0.41377684473991394, + "learning_rate": 0.00012545230263157894, + "loss": 0.4733, + "step": 1686 + }, + { + "epoch": 1.871585078352517, + "grad_norm": 0.2723773717880249, + "learning_rate": 0.00012532894736842103, + "loss": 0.5801, + "step": 1687 + }, + { + "epoch": 1.8726944945222577, + "grad_norm": 0.28274834156036377, + "learning_rate": 0.00012520559210526315, + "loss": 0.6009, + "step": 1688 + }, + { + "epoch": 1.8738039106919984, + "grad_norm": 0.3209463655948639, + "learning_rate": 0.00012508223684210525, + "loss": 0.5851, + "step": 1689 + }, + { + "epoch": 1.874913326861739, + "grad_norm": 0.24118223786354065, + "learning_rate": 0.00012495888157894737, + "loss": 0.6186, + "step": 1690 + }, + { + "epoch": 1.8760227430314798, + "grad_norm": 0.3865971565246582, + "learning_rate": 0.00012483552631578946, + "loss": 0.5411, + "step": 1691 + }, + { + "epoch": 1.8771321592012202, + "grad_norm": 0.23888447880744934, + "learning_rate": 0.00012471217105263155, + "loss": 0.3907, + "step": 1692 + }, + { + "epoch": 1.8782415753709611, + "grad_norm": 0.290234237909317, + "learning_rate": 0.00012458881578947367, + "loss": 0.3822, + "step": 1693 + }, + { + "epoch": 1.8793509915407016, + "grad_norm": 0.2845550775527954, + "learning_rate": 0.0001244654605263158, + "loss": 0.4248, + "step": 1694 + }, + { + "epoch": 1.8804604077104425, + "grad_norm": 0.3374759554862976, + "learning_rate": 0.0001243421052631579, + "loss": 0.7508, + "step": 1695 + }, + { + "epoch": 1.881569823880183, + "grad_norm": 0.39034581184387207, + "learning_rate": 0.00012421874999999998, + "loss": 0.4806, + "step": 1696 + }, + { + "epoch": 1.8826792400499237, + "grad_norm": 0.39774978160858154, + "learning_rate": 0.0001240953947368421, + "loss": 0.2821, + "step": 1697 + }, + { + "epoch": 1.8837886562196644, + "grad_norm": 0.31861943006515503, + "learning_rate": 0.0001239720394736842, + "loss": 0.487, + "step": 1698 + }, + { + "epoch": 1.884898072389405, + "grad_norm": 0.3267800807952881, + "learning_rate": 0.00012384868421052631, + "loss": 0.4185, + "step": 1699 + }, + { + "epoch": 1.8860074885591458, + "grad_norm": 0.28482627868652344, + "learning_rate": 0.0001237253289473684, + "loss": 0.5111, + "step": 1700 + }, + { + "epoch": 1.8871169047288863, + "grad_norm": 0.3203260898590088, + "learning_rate": 0.00012360197368421053, + "loss": 0.465, + "step": 1701 + }, + { + "epoch": 1.8882263208986272, + "grad_norm": 0.2945539057254791, + "learning_rate": 0.00012347861842105262, + "loss": 0.615, + "step": 1702 + }, + { + "epoch": 1.8893357370683677, + "grad_norm": 0.29036056995391846, + "learning_rate": 0.00012335526315789471, + "loss": 0.5022, + "step": 1703 + }, + { + "epoch": 1.8904451532381086, + "grad_norm": 0.22323249280452728, + "learning_rate": 0.00012323190789473683, + "loss": 0.5774, + "step": 1704 + }, + { + "epoch": 1.891554569407849, + "grad_norm": 0.3879876434803009, + "learning_rate": 0.00012310855263157893, + "loss": 0.732, + "step": 1705 + }, + { + "epoch": 1.8926639855775897, + "grad_norm": 0.49169594049453735, + "learning_rate": 0.00012298519736842105, + "loss": 0.6079, + "step": 1706 + }, + { + "epoch": 1.8937734017473304, + "grad_norm": 0.32131388783454895, + "learning_rate": 0.00012286184210526314, + "loss": 0.4357, + "step": 1707 + }, + { + "epoch": 1.8948828179170711, + "grad_norm": 0.2757743299007416, + "learning_rate": 0.00012273848684210526, + "loss": 0.4277, + "step": 1708 + }, + { + "epoch": 1.8959922340868118, + "grad_norm": 0.2627353370189667, + "learning_rate": 0.00012261513157894735, + "loss": 0.4186, + "step": 1709 + }, + { + "epoch": 1.8971016502565525, + "grad_norm": 0.24395854771137238, + "learning_rate": 0.00012249177631578947, + "loss": 0.4422, + "step": 1710 + }, + { + "epoch": 1.8982110664262932, + "grad_norm": 0.2920277416706085, + "learning_rate": 0.00012236842105263157, + "loss": 0.3717, + "step": 1711 + }, + { + "epoch": 1.8993204825960337, + "grad_norm": 0.22231972217559814, + "learning_rate": 0.00012224506578947366, + "loss": 0.4537, + "step": 1712 + }, + { + "epoch": 1.9004298987657746, + "grad_norm": 0.2766577899456024, + "learning_rate": 0.00012212171052631578, + "loss": 0.4264, + "step": 1713 + }, + { + "epoch": 1.901539314935515, + "grad_norm": 0.30484381318092346, + "learning_rate": 0.00012199835526315789, + "loss": 0.4421, + "step": 1714 + }, + { + "epoch": 1.902648731105256, + "grad_norm": 0.36658528447151184, + "learning_rate": 0.000121875, + "loss": 0.4917, + "step": 1715 + }, + { + "epoch": 1.9037581472749965, + "grad_norm": 0.38978394865989685, + "learning_rate": 0.00012175164473684209, + "loss": 0.5901, + "step": 1716 + }, + { + "epoch": 1.9048675634447372, + "grad_norm": 0.3046998381614685, + "learning_rate": 0.0001216282894736842, + "loss": 0.4495, + "step": 1717 + }, + { + "epoch": 1.9059769796144779, + "grad_norm": 0.31667011976242065, + "learning_rate": 0.00012150493421052631, + "loss": 0.4073, + "step": 1718 + }, + { + "epoch": 1.9070863957842186, + "grad_norm": 0.3211687207221985, + "learning_rate": 0.00012138157894736841, + "loss": 0.4975, + "step": 1719 + }, + { + "epoch": 1.9081958119539593, + "grad_norm": 0.2827535569667816, + "learning_rate": 0.00012125822368421051, + "loss": 0.4363, + "step": 1720 + }, + { + "epoch": 1.9093052281236997, + "grad_norm": 0.28672489523887634, + "learning_rate": 0.00012113486842105262, + "loss": 0.5762, + "step": 1721 + }, + { + "epoch": 1.9104146442934407, + "grad_norm": 0.3268757462501526, + "learning_rate": 0.00012101151315789471, + "loss": 0.4126, + "step": 1722 + }, + { + "epoch": 1.9115240604631811, + "grad_norm": 0.3554566204547882, + "learning_rate": 0.00012088815789473683, + "loss": 0.5062, + "step": 1723 + }, + { + "epoch": 1.912633476632922, + "grad_norm": 0.3198055326938629, + "learning_rate": 0.00012076480263157894, + "loss": 0.4958, + "step": 1724 + }, + { + "epoch": 1.9137428928026625, + "grad_norm": 0.3657841980457306, + "learning_rate": 0.00012064144736842103, + "loss": 0.5931, + "step": 1725 + }, + { + "epoch": 1.9148523089724034, + "grad_norm": 0.45995911955833435, + "learning_rate": 0.00012051809210526314, + "loss": 0.5862, + "step": 1726 + }, + { + "epoch": 1.915961725142144, + "grad_norm": 0.4919174015522003, + "learning_rate": 0.00012039473684210526, + "loss": 0.452, + "step": 1727 + }, + { + "epoch": 1.9170711413118846, + "grad_norm": 0.3233271539211273, + "learning_rate": 0.00012027138157894737, + "loss": 0.6559, + "step": 1728 + }, + { + "epoch": 1.9181805574816253, + "grad_norm": 0.396419495344162, + "learning_rate": 0.00012014802631578946, + "loss": 0.7268, + "step": 1729 + }, + { + "epoch": 1.919289973651366, + "grad_norm": 0.2332264930009842, + "learning_rate": 0.00012002467105263157, + "loss": 0.3874, + "step": 1730 + }, + { + "epoch": 1.9203993898211067, + "grad_norm": 0.20889733731746674, + "learning_rate": 0.00011990131578947366, + "loss": 0.5648, + "step": 1731 + }, + { + "epoch": 1.9215088059908472, + "grad_norm": 0.25143593549728394, + "learning_rate": 0.00011977796052631578, + "loss": 0.4837, + "step": 1732 + }, + { + "epoch": 1.922618222160588, + "grad_norm": 0.2032875120639801, + "learning_rate": 0.00011965460526315789, + "loss": 0.4553, + "step": 1733 + }, + { + "epoch": 1.9237276383303286, + "grad_norm": 0.2216006964445114, + "learning_rate": 0.00011953125, + "loss": 0.3256, + "step": 1734 + }, + { + "epoch": 1.9248370545000695, + "grad_norm": 0.34091660380363464, + "learning_rate": 0.00011940789473684209, + "loss": 0.534, + "step": 1735 + }, + { + "epoch": 1.92594647066981, + "grad_norm": 0.35081061720848083, + "learning_rate": 0.0001192845394736842, + "loss": 0.4549, + "step": 1736 + }, + { + "epoch": 1.9270558868395506, + "grad_norm": 0.37153178453445435, + "learning_rate": 0.00011916118421052632, + "loss": 0.507, + "step": 1737 + }, + { + "epoch": 1.9281653030092913, + "grad_norm": 0.3207988142967224, + "learning_rate": 0.00011903782894736841, + "loss": 0.6104, + "step": 1738 + }, + { + "epoch": 1.929274719179032, + "grad_norm": 0.378360390663147, + "learning_rate": 0.00011891447368421052, + "loss": 0.5371, + "step": 1739 + }, + { + "epoch": 1.9303841353487727, + "grad_norm": 0.3643793761730194, + "learning_rate": 0.00011879111842105262, + "loss": 0.506, + "step": 1740 + }, + { + "epoch": 1.9314935515185134, + "grad_norm": 0.35685864090919495, + "learning_rate": 0.00011866776315789471, + "loss": 0.6205, + "step": 1741 + }, + { + "epoch": 1.9326029676882541, + "grad_norm": 0.349833220243454, + "learning_rate": 0.00011854440789473684, + "loss": 0.587, + "step": 1742 + }, + { + "epoch": 1.9337123838579946, + "grad_norm": 0.3674916923046112, + "learning_rate": 0.00011842105263157894, + "loss": 0.5616, + "step": 1743 + }, + { + "epoch": 1.9348218000277355, + "grad_norm": 0.4197103679180145, + "learning_rate": 0.00011829769736842104, + "loss": 0.6715, + "step": 1744 + }, + { + "epoch": 1.935931216197476, + "grad_norm": 0.2582911550998688, + "learning_rate": 0.00011817434210526314, + "loss": 0.3392, + "step": 1745 + }, + { + "epoch": 1.937040632367217, + "grad_norm": 0.3199860751628876, + "learning_rate": 0.00011805098684210526, + "loss": 0.4765, + "step": 1746 + }, + { + "epoch": 1.9381500485369574, + "grad_norm": 0.28448477387428284, + "learning_rate": 0.00011792763157894737, + "loss": 0.5821, + "step": 1747 + }, + { + "epoch": 1.939259464706698, + "grad_norm": 0.4114968180656433, + "learning_rate": 0.00011780427631578946, + "loss": 0.48, + "step": 1748 + }, + { + "epoch": 1.9403688808764388, + "grad_norm": 0.3065422773361206, + "learning_rate": 0.00011768092105263157, + "loss": 0.4095, + "step": 1749 + }, + { + "epoch": 1.9414782970461795, + "grad_norm": 0.33260229229927063, + "learning_rate": 0.00011755756578947366, + "loss": 0.5081, + "step": 1750 + }, + { + "epoch": 1.9425877132159202, + "grad_norm": 0.3044232130050659, + "learning_rate": 0.00011743421052631578, + "loss": 0.6083, + "step": 1751 + }, + { + "epoch": 1.9436971293856606, + "grad_norm": 0.3583667278289795, + "learning_rate": 0.00011731085526315789, + "loss": 0.6418, + "step": 1752 + }, + { + "epoch": 1.9448065455554016, + "grad_norm": 0.5549653172492981, + "learning_rate": 0.0001171875, + "loss": 0.8511, + "step": 1753 + }, + { + "epoch": 1.945915961725142, + "grad_norm": 0.24391904473304749, + "learning_rate": 0.00011706414473684209, + "loss": 0.4204, + "step": 1754 + }, + { + "epoch": 1.947025377894883, + "grad_norm": 0.4001742899417877, + "learning_rate": 0.0001169407894736842, + "loss": 0.4757, + "step": 1755 + }, + { + "epoch": 1.9481347940646234, + "grad_norm": 0.49259909987449646, + "learning_rate": 0.00011681743421052632, + "loss": 0.4522, + "step": 1756 + }, + { + "epoch": 1.9492442102343641, + "grad_norm": 0.3102129399776459, + "learning_rate": 0.00011669407894736841, + "loss": 0.3831, + "step": 1757 + }, + { + "epoch": 1.9503536264041048, + "grad_norm": 0.2639727294445038, + "learning_rate": 0.00011657072368421052, + "loss": 0.6364, + "step": 1758 + }, + { + "epoch": 1.9514630425738455, + "grad_norm": 0.31318995356559753, + "learning_rate": 0.00011644736842105262, + "loss": 0.5422, + "step": 1759 + }, + { + "epoch": 1.9525724587435862, + "grad_norm": 0.2877756357192993, + "learning_rate": 0.00011632401315789472, + "loss": 0.3984, + "step": 1760 + }, + { + "epoch": 1.953681874913327, + "grad_norm": 0.36178058385849, + "learning_rate": 0.00011620065789473684, + "loss": 0.6806, + "step": 1761 + }, + { + "epoch": 1.9547912910830676, + "grad_norm": 0.36867088079452515, + "learning_rate": 0.00011607730263157894, + "loss": 0.529, + "step": 1762 + }, + { + "epoch": 1.955900707252808, + "grad_norm": 0.25498855113983154, + "learning_rate": 0.00011595394736842104, + "loss": 0.6411, + "step": 1763 + }, + { + "epoch": 1.957010123422549, + "grad_norm": 0.26043468713760376, + "learning_rate": 0.00011583059210526314, + "loss": 0.5102, + "step": 1764 + }, + { + "epoch": 1.9581195395922895, + "grad_norm": 0.40660566091537476, + "learning_rate": 0.00011570723684210526, + "loss": 0.5582, + "step": 1765 + }, + { + "epoch": 1.9592289557620304, + "grad_norm": 0.4207366406917572, + "learning_rate": 0.00011558388157894736, + "loss": 0.7263, + "step": 1766 + }, + { + "epoch": 1.9603383719317709, + "grad_norm": 0.35944870114326477, + "learning_rate": 0.00011546052631578946, + "loss": 0.4644, + "step": 1767 + }, + { + "epoch": 1.9614477881015115, + "grad_norm": 0.2992507219314575, + "learning_rate": 0.00011533717105263157, + "loss": 0.7683, + "step": 1768 + }, + { + "epoch": 1.9625572042712522, + "grad_norm": 0.3475952744483948, + "learning_rate": 0.00011521381578947366, + "loss": 0.6288, + "step": 1769 + }, + { + "epoch": 1.963666620440993, + "grad_norm": 0.29175207018852234, + "learning_rate": 0.00011509046052631578, + "loss": 0.355, + "step": 1770 + }, + { + "epoch": 1.9647760366107336, + "grad_norm": 0.3024480640888214, + "learning_rate": 0.00011496710526315789, + "loss": 0.5916, + "step": 1771 + }, + { + "epoch": 1.9658854527804743, + "grad_norm": 0.310245543718338, + "learning_rate": 0.00011484375, + "loss": 0.6658, + "step": 1772 + }, + { + "epoch": 1.966994868950215, + "grad_norm": 0.2858862578868866, + "learning_rate": 0.00011472039473684209, + "loss": 0.5808, + "step": 1773 + }, + { + "epoch": 1.9681042851199555, + "grad_norm": 0.19843228161334991, + "learning_rate": 0.0001145970394736842, + "loss": 0.3837, + "step": 1774 + }, + { + "epoch": 1.9692137012896964, + "grad_norm": 0.37114304304122925, + "learning_rate": 0.00011447368421052632, + "loss": 0.7252, + "step": 1775 + }, + { + "epoch": 1.970323117459437, + "grad_norm": 0.3807290196418762, + "learning_rate": 0.00011435032894736841, + "loss": 0.4429, + "step": 1776 + }, + { + "epoch": 1.9714325336291778, + "grad_norm": 0.2850121557712555, + "learning_rate": 0.00011422697368421052, + "loss": 0.3464, + "step": 1777 + }, + { + "epoch": 1.9725419497989183, + "grad_norm": 0.34873002767562866, + "learning_rate": 0.00011410361842105262, + "loss": 0.4814, + "step": 1778 + }, + { + "epoch": 1.973651365968659, + "grad_norm": 0.42871007323265076, + "learning_rate": 0.00011398026315789472, + "loss": 0.4912, + "step": 1779 + }, + { + "epoch": 1.9747607821383997, + "grad_norm": 0.3286532163619995, + "learning_rate": 0.00011385690789473684, + "loss": 0.4862, + "step": 1780 + }, + { + "epoch": 1.9758701983081404, + "grad_norm": 0.3135276436805725, + "learning_rate": 0.00011373355263157894, + "loss": 0.3872, + "step": 1781 + }, + { + "epoch": 1.976979614477881, + "grad_norm": 0.37062501907348633, + "learning_rate": 0.00011361019736842104, + "loss": 0.5004, + "step": 1782 + }, + { + "epoch": 1.9780890306476215, + "grad_norm": 0.28763630986213684, + "learning_rate": 0.00011348684210526314, + "loss": 0.5367, + "step": 1783 + }, + { + "epoch": 1.9791984468173625, + "grad_norm": 0.34978562593460083, + "learning_rate": 0.00011336348684210526, + "loss": 0.5041, + "step": 1784 + }, + { + "epoch": 1.980307862987103, + "grad_norm": 0.2940448820590973, + "learning_rate": 0.00011324013157894736, + "loss": 0.5121, + "step": 1785 + }, + { + "epoch": 1.9814172791568438, + "grad_norm": 0.24150650203227997, + "learning_rate": 0.00011311677631578946, + "loss": 0.4354, + "step": 1786 + }, + { + "epoch": 1.9825266953265843, + "grad_norm": 0.24752016365528107, + "learning_rate": 0.00011299342105263157, + "loss": 0.3303, + "step": 1787 + }, + { + "epoch": 1.983636111496325, + "grad_norm": 0.2988849878311157, + "learning_rate": 0.00011287006578947366, + "loss": 0.5873, + "step": 1788 + }, + { + "epoch": 1.9847455276660657, + "grad_norm": 0.548851490020752, + "learning_rate": 0.00011274671052631578, + "loss": 0.6516, + "step": 1789 + }, + { + "epoch": 1.9858549438358064, + "grad_norm": 0.3005162477493286, + "learning_rate": 0.00011262335526315789, + "loss": 0.4203, + "step": 1790 + }, + { + "epoch": 1.986964360005547, + "grad_norm": 0.3434782922267914, + "learning_rate": 0.0001125, + "loss": 0.6023, + "step": 1791 + }, + { + "epoch": 1.9880737761752878, + "grad_norm": 0.27085399627685547, + "learning_rate": 0.00011237664473684209, + "loss": 0.4336, + "step": 1792 + }, + { + "epoch": 1.9891831923450285, + "grad_norm": 0.24659699201583862, + "learning_rate": 0.0001122532894736842, + "loss": 0.4351, + "step": 1793 + }, + { + "epoch": 1.990292608514769, + "grad_norm": 0.2878054976463318, + "learning_rate": 0.00011212993421052632, + "loss": 0.3109, + "step": 1794 + }, + { + "epoch": 1.9914020246845099, + "grad_norm": 0.2754107117652893, + "learning_rate": 0.00011200657894736841, + "loss": 0.5065, + "step": 1795 + }, + { + "epoch": 1.9925114408542504, + "grad_norm": 0.31422141194343567, + "learning_rate": 0.00011188322368421052, + "loss": 0.5294, + "step": 1796 + }, + { + "epoch": 1.9936208570239913, + "grad_norm": 0.2437220960855484, + "learning_rate": 0.00011175986842105262, + "loss": 0.4636, + "step": 1797 + }, + { + "epoch": 1.9947302731937318, + "grad_norm": 0.3113705515861511, + "learning_rate": 0.00011163651315789472, + "loss": 0.4084, + "step": 1798 + }, + { + "epoch": 1.9958396893634724, + "grad_norm": 0.2959713935852051, + "learning_rate": 0.00011151315789473684, + "loss": 0.6161, + "step": 1799 + }, + { + "epoch": 1.9969491055332131, + "grad_norm": 0.29905256628990173, + "learning_rate": 0.00011138980263157894, + "loss": 0.3853, + "step": 1800 + }, + { + "epoch": 1.9980585217029538, + "grad_norm": 0.3135545551776886, + "learning_rate": 0.00011126644736842104, + "loss": 0.5653, + "step": 1801 + }, + { + "epoch": 1.9991679378726945, + "grad_norm": 0.3632647395133972, + "learning_rate": 0.00011114309210526314, + "loss": 0.3835, + "step": 1802 + }, + { + "epoch": 2.000277354042435, + "grad_norm": 0.3683667480945587, + "learning_rate": 0.00011101973684210526, + "loss": 0.4497, + "step": 1803 + }, + { + "epoch": 2.001386770212176, + "grad_norm": 0.26978781819343567, + "learning_rate": 0.00011089638157894736, + "loss": 0.584, + "step": 1804 + }, + { + "epoch": 2.0024961863819164, + "grad_norm": 0.2260834127664566, + "learning_rate": 0.00011077302631578946, + "loss": 0.5175, + "step": 1805 + }, + { + "epoch": 2.0036056025516573, + "grad_norm": 0.2791745364665985, + "learning_rate": 0.00011064967105263157, + "loss": 0.5489, + "step": 1806 + }, + { + "epoch": 2.004715018721398, + "grad_norm": 0.4569042921066284, + "learning_rate": 0.00011052631578947366, + "loss": 0.4872, + "step": 1807 + }, + { + "epoch": 2.0058244348911387, + "grad_norm": 0.2634184956550598, + "learning_rate": 0.00011040296052631578, + "loss": 0.4137, + "step": 1808 + }, + { + "epoch": 2.006933851060879, + "grad_norm": 0.3725602626800537, + "learning_rate": 0.00011027960526315789, + "loss": 0.3663, + "step": 1809 + }, + { + "epoch": 2.00804326723062, + "grad_norm": 0.19589465856552124, + "learning_rate": 0.00011015624999999998, + "loss": 0.3241, + "step": 1810 + }, + { + "epoch": 2.0091526834003606, + "grad_norm": 0.2446906864643097, + "learning_rate": 0.00011003289473684209, + "loss": 0.3818, + "step": 1811 + }, + { + "epoch": 2.010262099570101, + "grad_norm": 0.2932548224925995, + "learning_rate": 0.0001099095394736842, + "loss": 0.4009, + "step": 1812 + }, + { + "epoch": 2.011371515739842, + "grad_norm": 0.23010744154453278, + "learning_rate": 0.00010978618421052632, + "loss": 0.5868, + "step": 1813 + }, + { + "epoch": 2.0124809319095824, + "grad_norm": 0.24582666158676147, + "learning_rate": 0.00010966282894736841, + "loss": 0.2866, + "step": 1814 + }, + { + "epoch": 2.0135903480793234, + "grad_norm": 0.2688146233558655, + "learning_rate": 0.00010953947368421052, + "loss": 0.3321, + "step": 1815 + }, + { + "epoch": 2.014699764249064, + "grad_norm": 0.35448578000068665, + "learning_rate": 0.00010941611842105262, + "loss": 0.5611, + "step": 1816 + }, + { + "epoch": 2.0158091804188047, + "grad_norm": 0.3180113732814789, + "learning_rate": 0.00010929276315789472, + "loss": 0.3527, + "step": 1817 + }, + { + "epoch": 2.016918596588545, + "grad_norm": 0.27800217270851135, + "learning_rate": 0.00010916940789473684, + "loss": 0.339, + "step": 1818 + }, + { + "epoch": 2.018028012758286, + "grad_norm": 0.34227412939071655, + "learning_rate": 0.00010904605263157894, + "loss": 0.4474, + "step": 1819 + }, + { + "epoch": 2.0191374289280266, + "grad_norm": 0.3180390000343323, + "learning_rate": 0.00010892269736842104, + "loss": 0.5412, + "step": 1820 + }, + { + "epoch": 2.020246845097767, + "grad_norm": 0.4531157314777374, + "learning_rate": 0.00010879934210526314, + "loss": 0.4999, + "step": 1821 + }, + { + "epoch": 2.021356261267508, + "grad_norm": 0.3139798045158386, + "learning_rate": 0.00010867598684210526, + "loss": 0.3725, + "step": 1822 + }, + { + "epoch": 2.0224656774372485, + "grad_norm": 0.2892252206802368, + "learning_rate": 0.00010855263157894736, + "loss": 0.462, + "step": 1823 + }, + { + "epoch": 2.0235750936069894, + "grad_norm": 0.34606751799583435, + "learning_rate": 0.00010842927631578946, + "loss": 0.2938, + "step": 1824 + }, + { + "epoch": 2.02468450977673, + "grad_norm": 0.3713940680027008, + "learning_rate": 0.00010830592105263157, + "loss": 0.3484, + "step": 1825 + }, + { + "epoch": 2.025793925946471, + "grad_norm": 0.2926501929759979, + "learning_rate": 0.00010818256578947366, + "loss": 0.3333, + "step": 1826 + }, + { + "epoch": 2.0269033421162113, + "grad_norm": 0.29994428157806396, + "learning_rate": 0.00010805921052631578, + "loss": 0.5876, + "step": 1827 + }, + { + "epoch": 2.028012758285952, + "grad_norm": 0.26852184534072876, + "learning_rate": 0.00010793585526315789, + "loss": 0.3795, + "step": 1828 + }, + { + "epoch": 2.0291221744556927, + "grad_norm": 0.3186289072036743, + "learning_rate": 0.00010781249999999998, + "loss": 0.4261, + "step": 1829 + }, + { + "epoch": 2.0302315906254336, + "grad_norm": 0.22164680063724518, + "learning_rate": 0.00010768914473684209, + "loss": 0.4383, + "step": 1830 + }, + { + "epoch": 2.031341006795174, + "grad_norm": 0.4684840142726898, + "learning_rate": 0.0001075657894736842, + "loss": 0.4621, + "step": 1831 + }, + { + "epoch": 2.0324504229649145, + "grad_norm": 0.27373453974723816, + "learning_rate": 0.00010744243421052632, + "loss": 0.4259, + "step": 1832 + }, + { + "epoch": 2.0335598391346554, + "grad_norm": 0.3046364188194275, + "learning_rate": 0.00010731907894736841, + "loss": 0.5675, + "step": 1833 + }, + { + "epoch": 2.034669255304396, + "grad_norm": 0.2961323857307434, + "learning_rate": 0.00010719572368421052, + "loss": 0.3838, + "step": 1834 + }, + { + "epoch": 2.035778671474137, + "grad_norm": 0.3641231656074524, + "learning_rate": 0.00010707236842105261, + "loss": 0.2031, + "step": 1835 + }, + { + "epoch": 2.0368880876438773, + "grad_norm": 0.38065147399902344, + "learning_rate": 0.00010694901315789472, + "loss": 0.3094, + "step": 1836 + }, + { + "epoch": 2.037997503813618, + "grad_norm": 0.3846987783908844, + "learning_rate": 0.00010682565789473684, + "loss": 0.4203, + "step": 1837 + }, + { + "epoch": 2.0391069199833587, + "grad_norm": 0.2999848425388336, + "learning_rate": 0.00010670230263157895, + "loss": 0.3044, + "step": 1838 + }, + { + "epoch": 2.0402163361530996, + "grad_norm": 0.4001493453979492, + "learning_rate": 0.00010657894736842104, + "loss": 0.3758, + "step": 1839 + }, + { + "epoch": 2.04132575232284, + "grad_norm": 0.42989227175712585, + "learning_rate": 0.00010645559210526315, + "loss": 0.4854, + "step": 1840 + }, + { + "epoch": 2.042435168492581, + "grad_norm": 0.3566846549510956, + "learning_rate": 0.00010633223684210527, + "loss": 0.4105, + "step": 1841 + }, + { + "epoch": 2.0435445846623215, + "grad_norm": 0.41669943928718567, + "learning_rate": 0.00010620888157894736, + "loss": 0.4248, + "step": 1842 + }, + { + "epoch": 2.044654000832062, + "grad_norm": 0.31254488229751587, + "learning_rate": 0.00010608552631578947, + "loss": 0.3155, + "step": 1843 + }, + { + "epoch": 2.045763417001803, + "grad_norm": 0.2741456925868988, + "learning_rate": 0.00010596217105263157, + "loss": 0.4663, + "step": 1844 + }, + { + "epoch": 2.0468728331715433, + "grad_norm": 0.40784788131713867, + "learning_rate": 0.00010583881578947367, + "loss": 0.2822, + "step": 1845 + }, + { + "epoch": 2.0479822493412843, + "grad_norm": 0.3757185935974121, + "learning_rate": 0.00010571546052631579, + "loss": 0.4443, + "step": 1846 + }, + { + "epoch": 2.0490916655110247, + "grad_norm": 0.38732078671455383, + "learning_rate": 0.00010559210526315789, + "loss": 0.4025, + "step": 1847 + }, + { + "epoch": 2.0502010816807656, + "grad_norm": 0.34661343693733215, + "learning_rate": 0.00010546874999999999, + "loss": 0.3945, + "step": 1848 + }, + { + "epoch": 2.051310497850506, + "grad_norm": 0.41781237721443176, + "learning_rate": 0.00010534539473684209, + "loss": 0.3046, + "step": 1849 + }, + { + "epoch": 2.052419914020247, + "grad_norm": 0.3018251955509186, + "learning_rate": 0.0001052220394736842, + "loss": 0.3751, + "step": 1850 + }, + { + "epoch": 2.0535293301899875, + "grad_norm": 0.2182953655719757, + "learning_rate": 0.00010509868421052632, + "loss": 0.4354, + "step": 1851 + }, + { + "epoch": 2.054638746359728, + "grad_norm": 0.48397496342658997, + "learning_rate": 0.00010497532894736841, + "loss": 0.362, + "step": 1852 + }, + { + "epoch": 2.055748162529469, + "grad_norm": 0.3845345079898834, + "learning_rate": 0.00010485197368421052, + "loss": 0.3564, + "step": 1853 + }, + { + "epoch": 2.0568575786992094, + "grad_norm": 0.2810097932815552, + "learning_rate": 0.00010472861842105261, + "loss": 0.2759, + "step": 1854 + }, + { + "epoch": 2.0579669948689503, + "grad_norm": 0.27831992506980896, + "learning_rate": 0.00010460526315789472, + "loss": 0.4981, + "step": 1855 + }, + { + "epoch": 2.0590764110386908, + "grad_norm": 0.48267292976379395, + "learning_rate": 0.00010448190789473684, + "loss": 0.5129, + "step": 1856 + }, + { + "epoch": 2.0601858272084317, + "grad_norm": 0.3351428508758545, + "learning_rate": 0.00010435855263157895, + "loss": 0.4921, + "step": 1857 + }, + { + "epoch": 2.061295243378172, + "grad_norm": 0.3631199598312378, + "learning_rate": 0.00010423519736842104, + "loss": 0.3983, + "step": 1858 + }, + { + "epoch": 2.062404659547913, + "grad_norm": 0.369219571352005, + "learning_rate": 0.00010411184210526315, + "loss": 0.4716, + "step": 1859 + }, + { + "epoch": 2.0635140757176536, + "grad_norm": 0.43210768699645996, + "learning_rate": 0.00010398848684210527, + "loss": 0.3501, + "step": 1860 + }, + { + "epoch": 2.0646234918873945, + "grad_norm": 0.41098493337631226, + "learning_rate": 0.00010386513157894736, + "loss": 0.4026, + "step": 1861 + }, + { + "epoch": 2.065732908057135, + "grad_norm": 0.36239397525787354, + "learning_rate": 0.00010374177631578947, + "loss": 0.297, + "step": 1862 + }, + { + "epoch": 2.0668423242268754, + "grad_norm": 0.41763097047805786, + "learning_rate": 0.00010361842105263157, + "loss": 0.3034, + "step": 1863 + }, + { + "epoch": 2.0679517403966163, + "grad_norm": 0.37006324529647827, + "learning_rate": 0.00010349506578947367, + "loss": 0.5594, + "step": 1864 + }, + { + "epoch": 2.069061156566357, + "grad_norm": 0.4518885612487793, + "learning_rate": 0.00010337171052631579, + "loss": 0.5642, + "step": 1865 + }, + { + "epoch": 2.0701705727360977, + "grad_norm": 0.3855383098125458, + "learning_rate": 0.00010324835526315789, + "loss": 0.2811, + "step": 1866 + }, + { + "epoch": 2.071279988905838, + "grad_norm": 0.3048069477081299, + "learning_rate": 0.00010312499999999999, + "loss": 0.3574, + "step": 1867 + }, + { + "epoch": 2.072389405075579, + "grad_norm": 0.28566887974739075, + "learning_rate": 0.00010300164473684209, + "loss": 0.2174, + "step": 1868 + }, + { + "epoch": 2.0734988212453196, + "grad_norm": 0.394229918718338, + "learning_rate": 0.0001028782894736842, + "loss": 0.3636, + "step": 1869 + }, + { + "epoch": 2.0746082374150605, + "grad_norm": 0.31521254777908325, + "learning_rate": 0.00010275493421052632, + "loss": 0.5267, + "step": 1870 + }, + { + "epoch": 2.075717653584801, + "grad_norm": 0.3841816782951355, + "learning_rate": 0.00010263157894736841, + "loss": 0.4475, + "step": 1871 + }, + { + "epoch": 2.0768270697545415, + "grad_norm": 0.3173518776893616, + "learning_rate": 0.00010250822368421052, + "loss": 0.3575, + "step": 1872 + }, + { + "epoch": 2.0779364859242824, + "grad_norm": 0.4765770435333252, + "learning_rate": 0.00010238486842105261, + "loss": 0.3616, + "step": 1873 + }, + { + "epoch": 2.079045902094023, + "grad_norm": 0.3841620683670044, + "learning_rate": 0.00010226151315789472, + "loss": 0.4769, + "step": 1874 + }, + { + "epoch": 2.0801553182637638, + "grad_norm": 0.3756863474845886, + "learning_rate": 0.00010213815789473684, + "loss": 0.3619, + "step": 1875 + }, + { + "epoch": 2.0812647344335042, + "grad_norm": 0.29783549904823303, + "learning_rate": 0.00010201480263157895, + "loss": 0.5892, + "step": 1876 + }, + { + "epoch": 2.082374150603245, + "grad_norm": 0.3997184634208679, + "learning_rate": 0.00010189144736842104, + "loss": 0.34, + "step": 1877 + }, + { + "epoch": 2.0834835667729856, + "grad_norm": 0.31789451837539673, + "learning_rate": 0.00010176809210526315, + "loss": 0.3102, + "step": 1878 + }, + { + "epoch": 2.0845929829427265, + "grad_norm": 0.3776637017726898, + "learning_rate": 0.00010164473684210527, + "loss": 0.3004, + "step": 1879 + }, + { + "epoch": 2.085702399112467, + "grad_norm": 0.3050936162471771, + "learning_rate": 0.00010152138157894736, + "loss": 0.3768, + "step": 1880 + }, + { + "epoch": 2.086811815282208, + "grad_norm": 0.33904218673706055, + "learning_rate": 0.00010139802631578947, + "loss": 0.461, + "step": 1881 + }, + { + "epoch": 2.0879212314519484, + "grad_norm": 0.5047959089279175, + "learning_rate": 0.00010127467105263157, + "loss": 0.4089, + "step": 1882 + }, + { + "epoch": 2.089030647621689, + "grad_norm": 0.3899175822734833, + "learning_rate": 0.00010115131578947367, + "loss": 0.5338, + "step": 1883 + }, + { + "epoch": 2.09014006379143, + "grad_norm": 0.5013115406036377, + "learning_rate": 0.00010102796052631579, + "loss": 0.4203, + "step": 1884 + }, + { + "epoch": 2.0912494799611703, + "grad_norm": 0.2986677289009094, + "learning_rate": 0.0001009046052631579, + "loss": 0.3183, + "step": 1885 + }, + { + "epoch": 2.092358896130911, + "grad_norm": 0.26325130462646484, + "learning_rate": 0.00010078124999999999, + "loss": 0.3204, + "step": 1886 + }, + { + "epoch": 2.0934683123006517, + "grad_norm": 0.333397775888443, + "learning_rate": 0.0001006578947368421, + "loss": 0.292, + "step": 1887 + }, + { + "epoch": 2.0945777284703926, + "grad_norm": 0.40086430311203003, + "learning_rate": 0.0001005345394736842, + "loss": 0.3311, + "step": 1888 + }, + { + "epoch": 2.095687144640133, + "grad_norm": 0.3059875965118408, + "learning_rate": 0.0001004111842105263, + "loss": 0.3572, + "step": 1889 + }, + { + "epoch": 2.096796560809874, + "grad_norm": 0.40417563915252686, + "learning_rate": 0.00010028782894736841, + "loss": 0.3889, + "step": 1890 + }, + { + "epoch": 2.0979059769796145, + "grad_norm": 0.45205605030059814, + "learning_rate": 0.00010016447368421052, + "loss": 0.5045, + "step": 1891 + }, + { + "epoch": 2.0990153931493554, + "grad_norm": 0.28313323855400085, + "learning_rate": 0.00010004111842105261, + "loss": 0.4171, + "step": 1892 + }, + { + "epoch": 2.100124809319096, + "grad_norm": 0.41001975536346436, + "learning_rate": 9.991776315789472e-05, + "loss": 0.2781, + "step": 1893 + }, + { + "epoch": 2.1012342254888363, + "grad_norm": 0.2708085775375366, + "learning_rate": 9.979440789473684e-05, + "loss": 0.3806, + "step": 1894 + }, + { + "epoch": 2.1023436416585772, + "grad_norm": 0.3741215467453003, + "learning_rate": 9.967105263157895e-05, + "loss": 0.2874, + "step": 1895 + }, + { + "epoch": 2.1034530578283177, + "grad_norm": 0.2658732831478119, + "learning_rate": 9.954769736842104e-05, + "loss": 0.3757, + "step": 1896 + }, + { + "epoch": 2.1045624739980586, + "grad_norm": 0.3492313623428345, + "learning_rate": 9.942434210526315e-05, + "loss": 0.4064, + "step": 1897 + }, + { + "epoch": 2.105671890167799, + "grad_norm": 0.34106922149658203, + "learning_rate": 9.930098684210527e-05, + "loss": 0.2377, + "step": 1898 + }, + { + "epoch": 2.10678130633754, + "grad_norm": 0.3154791593551636, + "learning_rate": 9.917763157894736e-05, + "loss": 0.4634, + "step": 1899 + }, + { + "epoch": 2.1078907225072805, + "grad_norm": 0.3917715549468994, + "learning_rate": 9.905427631578947e-05, + "loss": 0.4126, + "step": 1900 + }, + { + "epoch": 2.1090001386770214, + "grad_norm": 0.32718661427497864, + "learning_rate": 9.893092105263157e-05, + "loss": 0.3976, + "step": 1901 + }, + { + "epoch": 2.110109554846762, + "grad_norm": 0.2590242028236389, + "learning_rate": 9.880756578947367e-05, + "loss": 0.3127, + "step": 1902 + }, + { + "epoch": 2.1112189710165024, + "grad_norm": 0.44344115257263184, + "learning_rate": 9.868421052631579e-05, + "loss": 0.3864, + "step": 1903 + }, + { + "epoch": 2.1123283871862433, + "grad_norm": 0.425987184047699, + "learning_rate": 9.85608552631579e-05, + "loss": 0.4428, + "step": 1904 + }, + { + "epoch": 2.1134378033559837, + "grad_norm": 0.5364298224449158, + "learning_rate": 9.843749999999999e-05, + "loss": 0.3975, + "step": 1905 + }, + { + "epoch": 2.1145472195257247, + "grad_norm": 0.4158439338207245, + "learning_rate": 9.83141447368421e-05, + "loss": 0.499, + "step": 1906 + }, + { + "epoch": 2.115656635695465, + "grad_norm": 0.30733615159988403, + "learning_rate": 9.81907894736842e-05, + "loss": 0.3568, + "step": 1907 + }, + { + "epoch": 2.116766051865206, + "grad_norm": 0.2557796835899353, + "learning_rate": 9.806743421052631e-05, + "loss": 0.5255, + "step": 1908 + }, + { + "epoch": 2.1178754680349465, + "grad_norm": 0.36079320311546326, + "learning_rate": 9.794407894736841e-05, + "loss": 0.4101, + "step": 1909 + }, + { + "epoch": 2.1189848842046874, + "grad_norm": 0.3143673837184906, + "learning_rate": 9.782072368421052e-05, + "loss": 0.4313, + "step": 1910 + }, + { + "epoch": 2.120094300374428, + "grad_norm": 0.44542989134788513, + "learning_rate": 9.769736842105261e-05, + "loss": 0.3331, + "step": 1911 + }, + { + "epoch": 2.121203716544169, + "grad_norm": 0.28402179479599, + "learning_rate": 9.757401315789472e-05, + "loss": 0.3088, + "step": 1912 + }, + { + "epoch": 2.1223131327139093, + "grad_norm": 0.4471670687198639, + "learning_rate": 9.745065789473684e-05, + "loss": 0.4129, + "step": 1913 + }, + { + "epoch": 2.12342254888365, + "grad_norm": 0.29794514179229736, + "learning_rate": 9.732730263157893e-05, + "loss": 0.3326, + "step": 1914 + }, + { + "epoch": 2.1245319650533907, + "grad_norm": 0.37658748030662537, + "learning_rate": 9.720394736842104e-05, + "loss": 0.3271, + "step": 1915 + }, + { + "epoch": 2.125641381223131, + "grad_norm": 0.31780439615249634, + "learning_rate": 9.708059210526315e-05, + "loss": 0.3144, + "step": 1916 + }, + { + "epoch": 2.126750797392872, + "grad_norm": 0.3516882359981537, + "learning_rate": 9.695723684210527e-05, + "loss": 0.1996, + "step": 1917 + }, + { + "epoch": 2.1278602135626126, + "grad_norm": 0.3291231095790863, + "learning_rate": 9.683388157894736e-05, + "loss": 0.7135, + "step": 1918 + }, + { + "epoch": 2.1289696297323535, + "grad_norm": 0.4481741189956665, + "learning_rate": 9.671052631578947e-05, + "loss": 0.2855, + "step": 1919 + }, + { + "epoch": 2.130079045902094, + "grad_norm": 0.43044859170913696, + "learning_rate": 9.658717105263157e-05, + "loss": 0.4145, + "step": 1920 + }, + { + "epoch": 2.131188462071835, + "grad_norm": 0.3299560546875, + "learning_rate": 9.646381578947367e-05, + "loss": 0.5771, + "step": 1921 + }, + { + "epoch": 2.1322978782415754, + "grad_norm": 0.4290536940097809, + "learning_rate": 9.634046052631579e-05, + "loss": 0.492, + "step": 1922 + }, + { + "epoch": 2.133407294411316, + "grad_norm": 0.344137966632843, + "learning_rate": 9.62171052631579e-05, + "loss": 0.3803, + "step": 1923 + }, + { + "epoch": 2.1345167105810567, + "grad_norm": 0.511370062828064, + "learning_rate": 9.609374999999999e-05, + "loss": 0.3662, + "step": 1924 + }, + { + "epoch": 2.135626126750797, + "grad_norm": 0.4554339349269867, + "learning_rate": 9.59703947368421e-05, + "loss": 0.3805, + "step": 1925 + }, + { + "epoch": 2.136735542920538, + "grad_norm": 0.36867454648017883, + "learning_rate": 9.58470394736842e-05, + "loss": 0.4913, + "step": 1926 + }, + { + "epoch": 2.1378449590902786, + "grad_norm": 0.36078619956970215, + "learning_rate": 9.572368421052631e-05, + "loss": 0.3083, + "step": 1927 + }, + { + "epoch": 2.1389543752600195, + "grad_norm": 0.369831919670105, + "learning_rate": 9.560032894736841e-05, + "loss": 0.2677, + "step": 1928 + }, + { + "epoch": 2.14006379142976, + "grad_norm": 0.4087238013744354, + "learning_rate": 9.547697368421052e-05, + "loss": 0.4678, + "step": 1929 + }, + { + "epoch": 2.141173207599501, + "grad_norm": 0.5325888991355896, + "learning_rate": 9.535361842105261e-05, + "loss": 0.3679, + "step": 1930 + }, + { + "epoch": 2.1422826237692414, + "grad_norm": 0.281781405210495, + "learning_rate": 9.523026315789472e-05, + "loss": 0.4423, + "step": 1931 + }, + { + "epoch": 2.1433920399389823, + "grad_norm": 0.29980483651161194, + "learning_rate": 9.510690789473684e-05, + "loss": 0.3535, + "step": 1932 + }, + { + "epoch": 2.144501456108723, + "grad_norm": 0.39666828513145447, + "learning_rate": 9.498355263157893e-05, + "loss": 0.5298, + "step": 1933 + }, + { + "epoch": 2.1456108722784633, + "grad_norm": 0.4402129352092743, + "learning_rate": 9.486019736842104e-05, + "loss": 0.4866, + "step": 1934 + }, + { + "epoch": 2.146720288448204, + "grad_norm": 0.4045298397541046, + "learning_rate": 9.473684210526315e-05, + "loss": 0.5098, + "step": 1935 + }, + { + "epoch": 2.1478297046179446, + "grad_norm": 0.3659813404083252, + "learning_rate": 9.461348684210527e-05, + "loss": 0.3095, + "step": 1936 + }, + { + "epoch": 2.1489391207876856, + "grad_norm": 0.3063139021396637, + "learning_rate": 9.449013157894736e-05, + "loss": 0.2183, + "step": 1937 + }, + { + "epoch": 2.150048536957426, + "grad_norm": 0.4679979979991913, + "learning_rate": 9.436677631578947e-05, + "loss": 0.514, + "step": 1938 + }, + { + "epoch": 2.151157953127167, + "grad_norm": 0.31606557965278625, + "learning_rate": 9.424342105263156e-05, + "loss": 0.4071, + "step": 1939 + }, + { + "epoch": 2.1522673692969074, + "grad_norm": 0.3824010193347931, + "learning_rate": 9.412006578947367e-05, + "loss": 0.3155, + "step": 1940 + }, + { + "epoch": 2.1533767854666483, + "grad_norm": 0.28109651803970337, + "learning_rate": 9.399671052631579e-05, + "loss": 0.344, + "step": 1941 + }, + { + "epoch": 2.154486201636389, + "grad_norm": 0.3306637704372406, + "learning_rate": 9.38733552631579e-05, + "loss": 0.1947, + "step": 1942 + }, + { + "epoch": 2.1555956178061297, + "grad_norm": 0.32935014367103577, + "learning_rate": 9.374999999999999e-05, + "loss": 0.2445, + "step": 1943 + }, + { + "epoch": 2.15670503397587, + "grad_norm": 0.33338576555252075, + "learning_rate": 9.36266447368421e-05, + "loss": 0.3996, + "step": 1944 + }, + { + "epoch": 2.1578144501456107, + "grad_norm": 0.25957128405570984, + "learning_rate": 9.35032894736842e-05, + "loss": 0.3473, + "step": 1945 + }, + { + "epoch": 2.1589238663153516, + "grad_norm": 0.48169735074043274, + "learning_rate": 9.337993421052631e-05, + "loss": 0.329, + "step": 1946 + }, + { + "epoch": 2.160033282485092, + "grad_norm": 0.4141751527786255, + "learning_rate": 9.325657894736842e-05, + "loss": 0.2981, + "step": 1947 + }, + { + "epoch": 2.161142698654833, + "grad_norm": 0.4750854969024658, + "learning_rate": 9.313322368421052e-05, + "loss": 0.5787, + "step": 1948 + }, + { + "epoch": 2.1622521148245735, + "grad_norm": 0.4501727819442749, + "learning_rate": 9.300986842105262e-05, + "loss": 0.2925, + "step": 1949 + }, + { + "epoch": 2.1633615309943144, + "grad_norm": 0.5955410599708557, + "learning_rate": 9.288651315789472e-05, + "loss": 0.3785, + "step": 1950 + }, + { + "epoch": 2.164470947164055, + "grad_norm": 0.36002209782600403, + "learning_rate": 9.276315789473684e-05, + "loss": 0.4342, + "step": 1951 + }, + { + "epoch": 2.1655803633337958, + "grad_norm": 0.4657028615474701, + "learning_rate": 9.263980263157894e-05, + "loss": 0.3671, + "step": 1952 + }, + { + "epoch": 2.1666897795035363, + "grad_norm": 0.7127841711044312, + "learning_rate": 9.251644736842104e-05, + "loss": 0.3303, + "step": 1953 + }, + { + "epoch": 2.167799195673277, + "grad_norm": 0.4676034152507782, + "learning_rate": 9.239309210526315e-05, + "loss": 0.507, + "step": 1954 + }, + { + "epoch": 2.1689086118430176, + "grad_norm": 0.6187905073165894, + "learning_rate": 9.226973684210527e-05, + "loss": 0.3498, + "step": 1955 + }, + { + "epoch": 2.170018028012758, + "grad_norm": 0.4146270751953125, + "learning_rate": 9.214638157894736e-05, + "loss": 0.4633, + "step": 1956 + }, + { + "epoch": 2.171127444182499, + "grad_norm": 0.3486730456352234, + "learning_rate": 9.202302631578947e-05, + "loss": 0.4834, + "step": 1957 + }, + { + "epoch": 2.1722368603522395, + "grad_norm": 0.30160388350486755, + "learning_rate": 9.189967105263156e-05, + "loss": 0.4886, + "step": 1958 + }, + { + "epoch": 2.1733462765219804, + "grad_norm": 0.571941077709198, + "learning_rate": 9.177631578947367e-05, + "loss": 0.6495, + "step": 1959 + }, + { + "epoch": 2.174455692691721, + "grad_norm": 0.2943151295185089, + "learning_rate": 9.165296052631579e-05, + "loss": 0.4508, + "step": 1960 + }, + { + "epoch": 2.175565108861462, + "grad_norm": 0.3195703625679016, + "learning_rate": 9.15296052631579e-05, + "loss": 0.4173, + "step": 1961 + }, + { + "epoch": 2.1766745250312023, + "grad_norm": 0.3255450427532196, + "learning_rate": 9.140624999999999e-05, + "loss": 0.2631, + "step": 1962 + }, + { + "epoch": 2.177783941200943, + "grad_norm": 0.34725460410118103, + "learning_rate": 9.12828947368421e-05, + "loss": 0.2789, + "step": 1963 + }, + { + "epoch": 2.1788933573706837, + "grad_norm": 0.284347265958786, + "learning_rate": 9.115953947368419e-05, + "loss": 0.3208, + "step": 1964 + }, + { + "epoch": 2.180002773540424, + "grad_norm": 0.3235912322998047, + "learning_rate": 9.103618421052631e-05, + "loss": 0.5076, + "step": 1965 + }, + { + "epoch": 2.181112189710165, + "grad_norm": 0.37382572889328003, + "learning_rate": 9.091282894736842e-05, + "loss": 0.4009, + "step": 1966 + }, + { + "epoch": 2.1822216058799055, + "grad_norm": 0.3380100727081299, + "learning_rate": 9.078947368421052e-05, + "loss": 0.3682, + "step": 1967 + }, + { + "epoch": 2.1833310220496465, + "grad_norm": 0.36504095792770386, + "learning_rate": 9.066611842105262e-05, + "loss": 0.4735, + "step": 1968 + }, + { + "epoch": 2.184440438219387, + "grad_norm": 0.44653430581092834, + "learning_rate": 9.054276315789472e-05, + "loss": 0.8669, + "step": 1969 + }, + { + "epoch": 2.185549854389128, + "grad_norm": 0.38333860039711, + "learning_rate": 9.041940789473684e-05, + "loss": 0.2508, + "step": 1970 + }, + { + "epoch": 2.1866592705588683, + "grad_norm": 0.413216233253479, + "learning_rate": 9.029605263157894e-05, + "loss": 0.5268, + "step": 1971 + }, + { + "epoch": 2.1877686867286092, + "grad_norm": 0.4521336257457733, + "learning_rate": 9.017269736842104e-05, + "loss": 0.62, + "step": 1972 + }, + { + "epoch": 2.1888781028983497, + "grad_norm": 0.39028438925743103, + "learning_rate": 9.004934210526315e-05, + "loss": 0.4885, + "step": 1973 + }, + { + "epoch": 2.18998751906809, + "grad_norm": 0.4401836693286896, + "learning_rate": 8.992598684210527e-05, + "loss": 0.3722, + "step": 1974 + }, + { + "epoch": 2.191096935237831, + "grad_norm": 0.4004587233066559, + "learning_rate": 8.980263157894736e-05, + "loss": 0.2697, + "step": 1975 + }, + { + "epoch": 2.1922063514075716, + "grad_norm": 0.5189459919929504, + "learning_rate": 8.967927631578947e-05, + "loss": 0.4762, + "step": 1976 + }, + { + "epoch": 2.1933157675773125, + "grad_norm": 0.5282573103904724, + "learning_rate": 8.955592105263156e-05, + "loss": 0.4813, + "step": 1977 + }, + { + "epoch": 2.194425183747053, + "grad_norm": 0.3748975098133087, + "learning_rate": 8.943256578947367e-05, + "loss": 0.4131, + "step": 1978 + }, + { + "epoch": 2.195534599916794, + "grad_norm": 0.3851288855075836, + "learning_rate": 8.930921052631579e-05, + "loss": 0.2714, + "step": 1979 + }, + { + "epoch": 2.1966440160865344, + "grad_norm": 0.2807680368423462, + "learning_rate": 8.91858552631579e-05, + "loss": 0.4121, + "step": 1980 + }, + { + "epoch": 2.1977534322562753, + "grad_norm": 0.42031747102737427, + "learning_rate": 8.906249999999999e-05, + "loss": 0.3362, + "step": 1981 + }, + { + "epoch": 2.1988628484260158, + "grad_norm": 0.33740946650505066, + "learning_rate": 8.89391447368421e-05, + "loss": 0.2981, + "step": 1982 + }, + { + "epoch": 2.1999722645957567, + "grad_norm": 0.27107852697372437, + "learning_rate": 8.881578947368419e-05, + "loss": 0.3691, + "step": 1983 + }, + { + "epoch": 2.201081680765497, + "grad_norm": 0.36126452684402466, + "learning_rate": 8.869243421052631e-05, + "loss": 0.4095, + "step": 1984 + }, + { + "epoch": 2.2021910969352376, + "grad_norm": 0.517387330532074, + "learning_rate": 8.856907894736842e-05, + "loss": 0.3078, + "step": 1985 + }, + { + "epoch": 2.2033005131049785, + "grad_norm": 0.5038511157035828, + "learning_rate": 8.844572368421052e-05, + "loss": 0.2611, + "step": 1986 + }, + { + "epoch": 2.204409929274719, + "grad_norm": 0.35804232954978943, + "learning_rate": 8.832236842105262e-05, + "loss": 0.37, + "step": 1987 + }, + { + "epoch": 2.20551934544446, + "grad_norm": 0.47403684258461, + "learning_rate": 8.819901315789472e-05, + "loss": 0.4773, + "step": 1988 + }, + { + "epoch": 2.2066287616142004, + "grad_norm": 0.38205355405807495, + "learning_rate": 8.807565789473684e-05, + "loss": 0.3112, + "step": 1989 + }, + { + "epoch": 2.2077381777839413, + "grad_norm": 0.39398112893104553, + "learning_rate": 8.795230263157894e-05, + "loss": 0.4293, + "step": 1990 + }, + { + "epoch": 2.208847593953682, + "grad_norm": 0.39993610978126526, + "learning_rate": 8.782894736842104e-05, + "loss": 0.5623, + "step": 1991 + }, + { + "epoch": 2.2099570101234227, + "grad_norm": 0.4437258243560791, + "learning_rate": 8.770559210526315e-05, + "loss": 0.5582, + "step": 1992 + }, + { + "epoch": 2.211066426293163, + "grad_norm": 0.3316510021686554, + "learning_rate": 8.758223684210526e-05, + "loss": 0.1147, + "step": 1993 + }, + { + "epoch": 2.212175842462904, + "grad_norm": 0.4257866442203522, + "learning_rate": 8.745888157894736e-05, + "loss": 0.2057, + "step": 1994 + }, + { + "epoch": 2.2132852586326446, + "grad_norm": 0.41767770051956177, + "learning_rate": 8.733552631578947e-05, + "loss": 0.5289, + "step": 1995 + }, + { + "epoch": 2.214394674802385, + "grad_norm": 0.3871503472328186, + "learning_rate": 8.721217105263156e-05, + "loss": 0.4098, + "step": 1996 + }, + { + "epoch": 2.215504090972126, + "grad_norm": 0.3213239014148712, + "learning_rate": 8.708881578947367e-05, + "loss": 0.3835, + "step": 1997 + }, + { + "epoch": 2.2166135071418664, + "grad_norm": 0.5189967155456543, + "learning_rate": 8.696546052631579e-05, + "loss": 0.473, + "step": 1998 + }, + { + "epoch": 2.2177229233116074, + "grad_norm": 0.36058536171913147, + "learning_rate": 8.68421052631579e-05, + "loss": 0.3896, + "step": 1999 + }, + { + "epoch": 2.218832339481348, + "grad_norm": 0.5485463738441467, + "learning_rate": 8.671874999999999e-05, + "loss": 0.5556, + "step": 2000 + }, + { + "epoch": 2.2199417556510888, + "grad_norm": 0.3734520971775055, + "learning_rate": 8.65953947368421e-05, + "loss": 0.42, + "step": 2001 + }, + { + "epoch": 2.2210511718208292, + "grad_norm": 0.30071988701820374, + "learning_rate": 8.647203947368419e-05, + "loss": 0.4252, + "step": 2002 + }, + { + "epoch": 2.22216058799057, + "grad_norm": 0.4292794167995453, + "learning_rate": 8.634868421052631e-05, + "loss": 0.2889, + "step": 2003 + }, + { + "epoch": 2.2232700041603106, + "grad_norm": 0.31171557307243347, + "learning_rate": 8.622532894736842e-05, + "loss": 0.4064, + "step": 2004 + }, + { + "epoch": 2.2243794203300515, + "grad_norm": 0.4342403709888458, + "learning_rate": 8.610197368421052e-05, + "loss": 0.3495, + "step": 2005 + }, + { + "epoch": 2.225488836499792, + "grad_norm": 0.3274979591369629, + "learning_rate": 8.597861842105262e-05, + "loss": 0.2234, + "step": 2006 + }, + { + "epoch": 2.2265982526695325, + "grad_norm": 0.3761701285839081, + "learning_rate": 8.585526315789472e-05, + "loss": 0.3346, + "step": 2007 + }, + { + "epoch": 2.2277076688392734, + "grad_norm": 0.3312693238258362, + "learning_rate": 8.573190789473684e-05, + "loss": 0.4378, + "step": 2008 + }, + { + "epoch": 2.228817085009014, + "grad_norm": 0.4094376862049103, + "learning_rate": 8.560855263157894e-05, + "loss": 0.3139, + "step": 2009 + }, + { + "epoch": 2.229926501178755, + "grad_norm": 0.44111502170562744, + "learning_rate": 8.548519736842104e-05, + "loss": 0.5182, + "step": 2010 + }, + { + "epoch": 2.2310359173484953, + "grad_norm": 0.40903040766716003, + "learning_rate": 8.536184210526315e-05, + "loss": 0.2903, + "step": 2011 + }, + { + "epoch": 2.232145333518236, + "grad_norm": 0.38946643471717834, + "learning_rate": 8.523848684210526e-05, + "loss": 0.5439, + "step": 2012 + }, + { + "epoch": 2.2332547496879767, + "grad_norm": 0.3774378001689911, + "learning_rate": 8.511513157894736e-05, + "loss": 0.4863, + "step": 2013 + }, + { + "epoch": 2.2343641658577176, + "grad_norm": 0.2982938289642334, + "learning_rate": 8.499177631578947e-05, + "loss": 0.3545, + "step": 2014 + }, + { + "epoch": 2.235473582027458, + "grad_norm": 0.24514225125312805, + "learning_rate": 8.486842105263156e-05, + "loss": 0.2306, + "step": 2015 + }, + { + "epoch": 2.2365829981971985, + "grad_norm": 0.4611694812774658, + "learning_rate": 8.474506578947367e-05, + "loss": 0.3808, + "step": 2016 + }, + { + "epoch": 2.2376924143669394, + "grad_norm": 0.43070465326309204, + "learning_rate": 8.462171052631579e-05, + "loss": 0.2849, + "step": 2017 + }, + { + "epoch": 2.23880183053668, + "grad_norm": 0.45175713300704956, + "learning_rate": 8.449835526315788e-05, + "loss": 0.2813, + "step": 2018 + }, + { + "epoch": 2.239911246706421, + "grad_norm": 0.4600198268890381, + "learning_rate": 8.437499999999999e-05, + "loss": 0.374, + "step": 2019 + }, + { + "epoch": 2.2410206628761613, + "grad_norm": 0.8133832216262817, + "learning_rate": 8.42516447368421e-05, + "loss": 0.4012, + "step": 2020 + }, + { + "epoch": 2.2421300790459022, + "grad_norm": 0.3792482316493988, + "learning_rate": 8.412828947368419e-05, + "loss": 0.3851, + "step": 2021 + }, + { + "epoch": 2.2432394952156427, + "grad_norm": 0.38575461506843567, + "learning_rate": 8.400493421052631e-05, + "loss": 0.3673, + "step": 2022 + }, + { + "epoch": 2.2443489113853836, + "grad_norm": 0.4712158143520355, + "learning_rate": 8.388157894736842e-05, + "loss": 0.5148, + "step": 2023 + }, + { + "epoch": 2.245458327555124, + "grad_norm": 0.4250771105289459, + "learning_rate": 8.375822368421052e-05, + "loss": 0.3809, + "step": 2024 + }, + { + "epoch": 2.2465677437248646, + "grad_norm": 0.3241025507450104, + "learning_rate": 8.363486842105262e-05, + "loss": 0.4474, + "step": 2025 + }, + { + "epoch": 2.2476771598946055, + "grad_norm": 0.2825429141521454, + "learning_rate": 8.351151315789472e-05, + "loss": 0.3693, + "step": 2026 + }, + { + "epoch": 2.248786576064346, + "grad_norm": 0.3415563106536865, + "learning_rate": 8.338815789473684e-05, + "loss": 0.3112, + "step": 2027 + }, + { + "epoch": 2.249895992234087, + "grad_norm": 0.4205566346645355, + "learning_rate": 8.326480263157894e-05, + "loss": 0.4012, + "step": 2028 + }, + { + "epoch": 2.2510054084038273, + "grad_norm": 0.39186304807662964, + "learning_rate": 8.314144736842104e-05, + "loss": 0.406, + "step": 2029 + }, + { + "epoch": 2.2521148245735683, + "grad_norm": 0.44930022954940796, + "learning_rate": 8.301809210526315e-05, + "loss": 0.3568, + "step": 2030 + }, + { + "epoch": 2.2532242407433087, + "grad_norm": 0.37317511439323425, + "learning_rate": 8.289473684210526e-05, + "loss": 0.3248, + "step": 2031 + }, + { + "epoch": 2.2543336569130497, + "grad_norm": 0.4311521053314209, + "learning_rate": 8.277138157894736e-05, + "loss": 0.4463, + "step": 2032 + }, + { + "epoch": 2.25544307308279, + "grad_norm": 0.29921552538871765, + "learning_rate": 8.264802631578947e-05, + "loss": 0.2457, + "step": 2033 + }, + { + "epoch": 2.256552489252531, + "grad_norm": 0.3454459011554718, + "learning_rate": 8.252467105263156e-05, + "loss": 0.505, + "step": 2034 + }, + { + "epoch": 2.2576619054222715, + "grad_norm": 0.38671955466270447, + "learning_rate": 8.240131578947367e-05, + "loss": 0.3366, + "step": 2035 + }, + { + "epoch": 2.258771321592012, + "grad_norm": 0.3811526596546173, + "learning_rate": 8.227796052631579e-05, + "loss": 0.4974, + "step": 2036 + }, + { + "epoch": 2.259880737761753, + "grad_norm": 0.3260783851146698, + "learning_rate": 8.215460526315788e-05, + "loss": 0.2986, + "step": 2037 + }, + { + "epoch": 2.2609901539314934, + "grad_norm": 0.3846670091152191, + "learning_rate": 8.203124999999999e-05, + "loss": 0.3801, + "step": 2038 + }, + { + "epoch": 2.2620995701012343, + "grad_norm": 0.38990718126296997, + "learning_rate": 8.19078947368421e-05, + "loss": 0.4151, + "step": 2039 + }, + { + "epoch": 2.2632089862709748, + "grad_norm": 0.36695122718811035, + "learning_rate": 8.178453947368419e-05, + "loss": 0.4518, + "step": 2040 + }, + { + "epoch": 2.2643184024407157, + "grad_norm": 0.33559417724609375, + "learning_rate": 8.166118421052631e-05, + "loss": 0.3286, + "step": 2041 + }, + { + "epoch": 2.265427818610456, + "grad_norm": 0.3156730830669403, + "learning_rate": 8.153782894736842e-05, + "loss": 0.3759, + "step": 2042 + }, + { + "epoch": 2.266537234780197, + "grad_norm": 0.4046294391155243, + "learning_rate": 8.141447368421051e-05, + "loss": 0.3196, + "step": 2043 + }, + { + "epoch": 2.2676466509499376, + "grad_norm": 0.24606218934059143, + "learning_rate": 8.129111842105262e-05, + "loss": 0.2994, + "step": 2044 + }, + { + "epoch": 2.2687560671196785, + "grad_norm": 0.2781525254249573, + "learning_rate": 8.116776315789473e-05, + "loss": 0.4763, + "step": 2045 + }, + { + "epoch": 2.269865483289419, + "grad_norm": 0.3566399812698364, + "learning_rate": 8.104440789473685e-05, + "loss": 0.2819, + "step": 2046 + }, + { + "epoch": 2.2709748994591594, + "grad_norm": 0.36124187707901, + "learning_rate": 8.092105263157894e-05, + "loss": 0.4515, + "step": 2047 + }, + { + "epoch": 2.2720843156289003, + "grad_norm": 0.42289498448371887, + "learning_rate": 8.079769736842105e-05, + "loss": 0.2499, + "step": 2048 + }, + { + "epoch": 2.273193731798641, + "grad_norm": 0.38013771176338196, + "learning_rate": 8.067434210526315e-05, + "loss": 0.2947, + "step": 2049 + }, + { + "epoch": 2.2743031479683817, + "grad_norm": 0.44941648840904236, + "learning_rate": 8.055098684210526e-05, + "loss": 0.5397, + "step": 2050 + }, + { + "epoch": 2.275412564138122, + "grad_norm": 0.4005190432071686, + "learning_rate": 8.042763157894737e-05, + "loss": 0.3519, + "step": 2051 + }, + { + "epoch": 2.276521980307863, + "grad_norm": 0.4730212986469269, + "learning_rate": 8.030427631578947e-05, + "loss": 0.3143, + "step": 2052 + }, + { + "epoch": 2.2776313964776036, + "grad_norm": 0.249730184674263, + "learning_rate": 8.018092105263157e-05, + "loss": 0.288, + "step": 2053 + }, + { + "epoch": 2.2787408126473445, + "grad_norm": 0.4012918174266815, + "learning_rate": 8.005756578947367e-05, + "loss": 0.3855, + "step": 2054 + }, + { + "epoch": 2.279850228817085, + "grad_norm": 0.40905871987342834, + "learning_rate": 7.993421052631579e-05, + "loss": 0.3615, + "step": 2055 + }, + { + "epoch": 2.280959644986826, + "grad_norm": 0.34130680561065674, + "learning_rate": 7.981085526315789e-05, + "loss": 0.4656, + "step": 2056 + }, + { + "epoch": 2.2820690611565664, + "grad_norm": 0.5201243162155151, + "learning_rate": 7.968749999999999e-05, + "loss": 0.5578, + "step": 2057 + }, + { + "epoch": 2.283178477326307, + "grad_norm": 0.3649352490901947, + "learning_rate": 7.95641447368421e-05, + "loss": 0.5485, + "step": 2058 + }, + { + "epoch": 2.2842878934960478, + "grad_norm": 0.3599608242511749, + "learning_rate": 7.944078947368419e-05, + "loss": 0.384, + "step": 2059 + }, + { + "epoch": 2.2853973096657882, + "grad_norm": 0.35313576459884644, + "learning_rate": 7.931743421052631e-05, + "loss": 0.3998, + "step": 2060 + }, + { + "epoch": 2.286506725835529, + "grad_norm": 0.3734196424484253, + "learning_rate": 7.919407894736842e-05, + "loss": 0.3514, + "step": 2061 + }, + { + "epoch": 2.2876161420052696, + "grad_norm": 0.4057319462299347, + "learning_rate": 7.907072368421051e-05, + "loss": 0.5574, + "step": 2062 + }, + { + "epoch": 2.2887255581750106, + "grad_norm": 0.3745683431625366, + "learning_rate": 7.894736842105262e-05, + "loss": 0.4718, + "step": 2063 + }, + { + "epoch": 2.289834974344751, + "grad_norm": 0.2819893956184387, + "learning_rate": 7.882401315789473e-05, + "loss": 0.3068, + "step": 2064 + }, + { + "epoch": 2.2909443905144915, + "grad_norm": 0.35861438512802124, + "learning_rate": 7.870065789473685e-05, + "loss": 0.5004, + "step": 2065 + }, + { + "epoch": 2.2920538066842324, + "grad_norm": 0.3276369273662567, + "learning_rate": 7.857730263157894e-05, + "loss": 0.4357, + "step": 2066 + }, + { + "epoch": 2.2931632228539733, + "grad_norm": 0.6049783229827881, + "learning_rate": 7.845394736842105e-05, + "loss": 0.2697, + "step": 2067 + }, + { + "epoch": 2.294272639023714, + "grad_norm": 0.3910931348800659, + "learning_rate": 7.833059210526314e-05, + "loss": 0.4302, + "step": 2068 + }, + { + "epoch": 2.2953820551934543, + "grad_norm": 0.5576995611190796, + "learning_rate": 7.820723684210526e-05, + "loss": 0.3253, + "step": 2069 + }, + { + "epoch": 2.296491471363195, + "grad_norm": 0.43437716364860535, + "learning_rate": 7.808388157894737e-05, + "loss": 0.5236, + "step": 2070 + }, + { + "epoch": 2.2976008875329357, + "grad_norm": 0.3469353914260864, + "learning_rate": 7.796052631578947e-05, + "loss": 0.391, + "step": 2071 + }, + { + "epoch": 2.2987103037026766, + "grad_norm": 0.5261640548706055, + "learning_rate": 7.783717105263157e-05, + "loss": 0.3683, + "step": 2072 + }, + { + "epoch": 2.299819719872417, + "grad_norm": 0.4726692736148834, + "learning_rate": 7.771381578947367e-05, + "loss": 0.4087, + "step": 2073 + }, + { + "epoch": 2.300929136042158, + "grad_norm": 0.3760102391242981, + "learning_rate": 7.759046052631579e-05, + "loss": 0.424, + "step": 2074 + }, + { + "epoch": 2.3020385522118985, + "grad_norm": 0.38098374009132385, + "learning_rate": 7.746710526315789e-05, + "loss": 0.3709, + "step": 2075 + }, + { + "epoch": 2.303147968381639, + "grad_norm": 0.42381149530410767, + "learning_rate": 7.734374999999999e-05, + "loss": 0.5073, + "step": 2076 + }, + { + "epoch": 2.30425738455138, + "grad_norm": 0.27536872029304504, + "learning_rate": 7.72203947368421e-05, + "loss": 0.339, + "step": 2077 + }, + { + "epoch": 2.3053668007211203, + "grad_norm": 0.7497661113739014, + "learning_rate": 7.709703947368419e-05, + "loss": 0.4369, + "step": 2078 + }, + { + "epoch": 2.3064762168908612, + "grad_norm": 0.3368166387081146, + "learning_rate": 7.697368421052631e-05, + "loss": 0.4125, + "step": 2079 + }, + { + "epoch": 2.3075856330606017, + "grad_norm": 0.43875083327293396, + "learning_rate": 7.685032894736842e-05, + "loss": 0.2639, + "step": 2080 + }, + { + "epoch": 2.3086950492303426, + "grad_norm": 0.44184234738349915, + "learning_rate": 7.672697368421051e-05, + "loss": 0.3158, + "step": 2081 + }, + { + "epoch": 2.309804465400083, + "grad_norm": 0.3724750280380249, + "learning_rate": 7.660361842105262e-05, + "loss": 0.4184, + "step": 2082 + }, + { + "epoch": 2.310913881569824, + "grad_norm": 0.37243518233299255, + "learning_rate": 7.648026315789473e-05, + "loss": 0.3311, + "step": 2083 + }, + { + "epoch": 2.3120232977395645, + "grad_norm": 0.31451210379600525, + "learning_rate": 7.635690789473685e-05, + "loss": 0.4138, + "step": 2084 + }, + { + "epoch": 2.3131327139093054, + "grad_norm": 0.5280860066413879, + "learning_rate": 7.623355263157894e-05, + "loss": 0.3653, + "step": 2085 + }, + { + "epoch": 2.314242130079046, + "grad_norm": 0.4472182095050812, + "learning_rate": 7.611019736842105e-05, + "loss": 0.3394, + "step": 2086 + }, + { + "epoch": 2.3153515462487864, + "grad_norm": 0.39410504698753357, + "learning_rate": 7.598684210526314e-05, + "loss": 0.4175, + "step": 2087 + }, + { + "epoch": 2.3164609624185273, + "grad_norm": 0.30498364567756653, + "learning_rate": 7.586348684210526e-05, + "loss": 0.2771, + "step": 2088 + }, + { + "epoch": 2.3175703785882678, + "grad_norm": 0.3658243417739868, + "learning_rate": 7.574013157894737e-05, + "loss": 0.211, + "step": 2089 + }, + { + "epoch": 2.3186797947580087, + "grad_norm": 0.27675336599349976, + "learning_rate": 7.561677631578947e-05, + "loss": 0.4383, + "step": 2090 + }, + { + "epoch": 2.319789210927749, + "grad_norm": 0.47002074122428894, + "learning_rate": 7.549342105263157e-05, + "loss": 0.2742, + "step": 2091 + }, + { + "epoch": 2.32089862709749, + "grad_norm": 0.39124634861946106, + "learning_rate": 7.537006578947367e-05, + "loss": 0.3856, + "step": 2092 + }, + { + "epoch": 2.3220080432672305, + "grad_norm": 0.467118501663208, + "learning_rate": 7.52467105263158e-05, + "loss": 0.4138, + "step": 2093 + }, + { + "epoch": 2.3231174594369715, + "grad_norm": 0.29437050223350525, + "learning_rate": 7.512335526315789e-05, + "loss": 0.2429, + "step": 2094 + }, + { + "epoch": 2.324226875606712, + "grad_norm": 0.29561078548431396, + "learning_rate": 7.5e-05, + "loss": 0.3395, + "step": 2095 + }, + { + "epoch": 2.325336291776453, + "grad_norm": 0.5432490706443787, + "learning_rate": 7.48766447368421e-05, + "loss": 0.6463, + "step": 2096 + }, + { + "epoch": 2.3264457079461933, + "grad_norm": 0.32837289571762085, + "learning_rate": 7.475328947368421e-05, + "loss": 0.4072, + "step": 2097 + }, + { + "epoch": 2.327555124115934, + "grad_norm": 0.3328750729560852, + "learning_rate": 7.462993421052631e-05, + "loss": 0.6708, + "step": 2098 + }, + { + "epoch": 2.3286645402856747, + "grad_norm": 0.44809266924858093, + "learning_rate": 7.450657894736842e-05, + "loss": 0.2939, + "step": 2099 + }, + { + "epoch": 2.329773956455415, + "grad_norm": 0.4526784121990204, + "learning_rate": 7.438322368421051e-05, + "loss": 0.4913, + "step": 2100 + }, + { + "epoch": 2.330883372625156, + "grad_norm": 0.3075268864631653, + "learning_rate": 7.425986842105263e-05, + "loss": 0.3334, + "step": 2101 + }, + { + "epoch": 2.3319927887948966, + "grad_norm": 0.33832165598869324, + "learning_rate": 7.413651315789473e-05, + "loss": 0.3785, + "step": 2102 + }, + { + "epoch": 2.3331022049646375, + "grad_norm": 0.3870348632335663, + "learning_rate": 7.401315789473683e-05, + "loss": 0.4643, + "step": 2103 + }, + { + "epoch": 2.334211621134378, + "grad_norm": 0.39963188767433167, + "learning_rate": 7.388980263157894e-05, + "loss": 0.296, + "step": 2104 + }, + { + "epoch": 2.335321037304119, + "grad_norm": 0.40964365005493164, + "learning_rate": 7.376644736842105e-05, + "loss": 0.3673, + "step": 2105 + }, + { + "epoch": 2.3364304534738594, + "grad_norm": 0.31597647070884705, + "learning_rate": 7.364309210526315e-05, + "loss": 0.3001, + "step": 2106 + }, + { + "epoch": 2.3375398696436003, + "grad_norm": 0.364797443151474, + "learning_rate": 7.351973684210526e-05, + "loss": 0.3574, + "step": 2107 + }, + { + "epoch": 2.3386492858133408, + "grad_norm": 0.4309171438217163, + "learning_rate": 7.339638157894735e-05, + "loss": 0.5573, + "step": 2108 + }, + { + "epoch": 2.3397587019830812, + "grad_norm": 0.34616556763648987, + "learning_rate": 7.327302631578947e-05, + "loss": 0.4426, + "step": 2109 + }, + { + "epoch": 2.340868118152822, + "grad_norm": 0.6331126689910889, + "learning_rate": 7.314967105263157e-05, + "loss": 0.7939, + "step": 2110 + }, + { + "epoch": 2.3419775343225626, + "grad_norm": 0.43343913555145264, + "learning_rate": 7.302631578947367e-05, + "loss": 0.3552, + "step": 2111 + }, + { + "epoch": 2.3430869504923035, + "grad_norm": 0.42529061436653137, + "learning_rate": 7.290296052631578e-05, + "loss": 0.5603, + "step": 2112 + }, + { + "epoch": 2.344196366662044, + "grad_norm": 0.32142138481140137, + "learning_rate": 7.277960526315789e-05, + "loss": 0.2149, + "step": 2113 + }, + { + "epoch": 2.345305782831785, + "grad_norm": 0.29198598861694336, + "learning_rate": 7.265625e-05, + "loss": 0.3719, + "step": 2114 + }, + { + "epoch": 2.3464151990015254, + "grad_norm": 0.36399996280670166, + "learning_rate": 7.25328947368421e-05, + "loss": 0.369, + "step": 2115 + }, + { + "epoch": 2.347524615171266, + "grad_norm": 0.288010835647583, + "learning_rate": 7.240953947368421e-05, + "loss": 0.4388, + "step": 2116 + }, + { + "epoch": 2.348634031341007, + "grad_norm": 0.45767742395401, + "learning_rate": 7.22861842105263e-05, + "loss": 0.5151, + "step": 2117 + }, + { + "epoch": 2.3497434475107477, + "grad_norm": 0.4015630781650543, + "learning_rate": 7.216282894736842e-05, + "loss": 0.2633, + "step": 2118 + }, + { + "epoch": 2.350852863680488, + "grad_norm": 0.4112047553062439, + "learning_rate": 7.203947368421051e-05, + "loss": 0.4331, + "step": 2119 + }, + { + "epoch": 2.3519622798502287, + "grad_norm": 0.4492191672325134, + "learning_rate": 7.191611842105263e-05, + "loss": 0.3771, + "step": 2120 + }, + { + "epoch": 2.3530716960199696, + "grad_norm": 0.36163103580474854, + "learning_rate": 7.179276315789473e-05, + "loss": 0.4598, + "step": 2121 + }, + { + "epoch": 2.35418111218971, + "grad_norm": 0.3377210795879364, + "learning_rate": 7.166940789473683e-05, + "loss": 0.1919, + "step": 2122 + }, + { + "epoch": 2.355290528359451, + "grad_norm": 0.6023211479187012, + "learning_rate": 7.154605263157894e-05, + "loss": 0.4939, + "step": 2123 + }, + { + "epoch": 2.3563999445291914, + "grad_norm": 0.31167641282081604, + "learning_rate": 7.142269736842105e-05, + "loss": 0.4228, + "step": 2124 + }, + { + "epoch": 2.3575093606989324, + "grad_norm": 0.39771780371665955, + "learning_rate": 7.129934210526315e-05, + "loss": 0.4054, + "step": 2125 + }, + { + "epoch": 2.358618776868673, + "grad_norm": 0.45539480447769165, + "learning_rate": 7.117598684210526e-05, + "loss": 0.4532, + "step": 2126 + }, + { + "epoch": 2.3597281930384133, + "grad_norm": 0.3616165220737457, + "learning_rate": 7.105263157894735e-05, + "loss": 0.4083, + "step": 2127 + }, + { + "epoch": 2.360837609208154, + "grad_norm": 0.3542384207248688, + "learning_rate": 7.092927631578947e-05, + "loss": 0.4871, + "step": 2128 + }, + { + "epoch": 2.3619470253778947, + "grad_norm": 0.3284272253513336, + "learning_rate": 7.080592105263157e-05, + "loss": 0.2563, + "step": 2129 + }, + { + "epoch": 2.3630564415476356, + "grad_norm": 0.48142263293266296, + "learning_rate": 7.068256578947367e-05, + "loss": 0.3424, + "step": 2130 + }, + { + "epoch": 2.364165857717376, + "grad_norm": 0.39508047699928284, + "learning_rate": 7.055921052631578e-05, + "loss": 0.4946, + "step": 2131 + }, + { + "epoch": 2.365275273887117, + "grad_norm": 0.31679755449295044, + "learning_rate": 7.043585526315789e-05, + "loss": 0.494, + "step": 2132 + }, + { + "epoch": 2.3663846900568575, + "grad_norm": 0.43126723170280457, + "learning_rate": 7.03125e-05, + "loss": 0.3824, + "step": 2133 + }, + { + "epoch": 2.3674941062265984, + "grad_norm": 0.43794259428977966, + "learning_rate": 7.01891447368421e-05, + "loss": 0.3238, + "step": 2134 + }, + { + "epoch": 2.368603522396339, + "grad_norm": 0.3279634714126587, + "learning_rate": 7.006578947368421e-05, + "loss": 0.3239, + "step": 2135 + }, + { + "epoch": 2.36971293856608, + "grad_norm": 0.32456138730049133, + "learning_rate": 6.99424342105263e-05, + "loss": 0.316, + "step": 2136 + }, + { + "epoch": 2.3708223547358203, + "grad_norm": 0.4760146141052246, + "learning_rate": 6.981907894736842e-05, + "loss": 0.3338, + "step": 2137 + }, + { + "epoch": 2.3719317709055607, + "grad_norm": 0.5461307168006897, + "learning_rate": 6.969572368421051e-05, + "loss": 0.2907, + "step": 2138 + }, + { + "epoch": 2.3730411870753017, + "grad_norm": 0.3460582494735718, + "learning_rate": 6.957236842105264e-05, + "loss": 0.3996, + "step": 2139 + }, + { + "epoch": 2.374150603245042, + "grad_norm": 0.41096046566963196, + "learning_rate": 6.944901315789473e-05, + "loss": 0.2277, + "step": 2140 + }, + { + "epoch": 2.375260019414783, + "grad_norm": 0.48936372995376587, + "learning_rate": 6.932565789473683e-05, + "loss": 0.3666, + "step": 2141 + }, + { + "epoch": 2.3763694355845235, + "grad_norm": 0.36358651518821716, + "learning_rate": 6.920230263157894e-05, + "loss": 0.3881, + "step": 2142 + }, + { + "epoch": 2.3774788517542644, + "grad_norm": 0.45791712403297424, + "learning_rate": 6.907894736842105e-05, + "loss": 0.3655, + "step": 2143 + }, + { + "epoch": 2.378588267924005, + "grad_norm": 0.3084203898906708, + "learning_rate": 6.895559210526316e-05, + "loss": 0.3494, + "step": 2144 + }, + { + "epoch": 2.379697684093746, + "grad_norm": 0.5072697997093201, + "learning_rate": 6.883223684210526e-05, + "loss": 0.4923, + "step": 2145 + }, + { + "epoch": 2.3808071002634863, + "grad_norm": 0.5244908928871155, + "learning_rate": 6.870888157894735e-05, + "loss": 0.4565, + "step": 2146 + }, + { + "epoch": 2.381916516433227, + "grad_norm": 0.3606557548046112, + "learning_rate": 6.858552631578948e-05, + "loss": 0.2892, + "step": 2147 + }, + { + "epoch": 2.3830259326029677, + "grad_norm": 0.5143904089927673, + "learning_rate": 6.846217105263157e-05, + "loss": 0.3186, + "step": 2148 + }, + { + "epoch": 2.384135348772708, + "grad_norm": 0.42753830552101135, + "learning_rate": 6.833881578947368e-05, + "loss": 0.4308, + "step": 2149 + }, + { + "epoch": 2.385244764942449, + "grad_norm": 0.32439666986465454, + "learning_rate": 6.821546052631578e-05, + "loss": 0.3906, + "step": 2150 + }, + { + "epoch": 2.3863541811121896, + "grad_norm": 0.4810985028743744, + "learning_rate": 6.809210526315789e-05, + "loss": 0.3133, + "step": 2151 + }, + { + "epoch": 2.3874635972819305, + "grad_norm": 0.4014139473438263, + "learning_rate": 6.796875e-05, + "loss": 0.3086, + "step": 2152 + }, + { + "epoch": 2.388573013451671, + "grad_norm": 0.3334631621837616, + "learning_rate": 6.78453947368421e-05, + "loss": 0.3678, + "step": 2153 + }, + { + "epoch": 2.389682429621412, + "grad_norm": 0.6455233097076416, + "learning_rate": 6.772203947368421e-05, + "loss": 0.3285, + "step": 2154 + }, + { + "epoch": 2.3907918457911523, + "grad_norm": 0.3901275396347046, + "learning_rate": 6.75986842105263e-05, + "loss": 0.3597, + "step": 2155 + }, + { + "epoch": 2.3919012619608933, + "grad_norm": 0.4130385220050812, + "learning_rate": 6.747532894736842e-05, + "loss": 0.3987, + "step": 2156 + }, + { + "epoch": 2.3930106781306337, + "grad_norm": 0.4633021652698517, + "learning_rate": 6.735197368421052e-05, + "loss": 0.3555, + "step": 2157 + }, + { + "epoch": 2.3941200943003746, + "grad_norm": 0.3566206097602844, + "learning_rate": 6.722861842105264e-05, + "loss": 0.5682, + "step": 2158 + }, + { + "epoch": 2.395229510470115, + "grad_norm": 0.37410175800323486, + "learning_rate": 6.710526315789473e-05, + "loss": 0.5559, + "step": 2159 + }, + { + "epoch": 2.3963389266398556, + "grad_norm": 0.47398602962493896, + "learning_rate": 6.698190789473684e-05, + "loss": 0.3974, + "step": 2160 + }, + { + "epoch": 2.3974483428095965, + "grad_norm": 0.3041347563266754, + "learning_rate": 6.685855263157894e-05, + "loss": 0.306, + "step": 2161 + }, + { + "epoch": 2.398557758979337, + "grad_norm": 0.6512690782546997, + "learning_rate": 6.673519736842105e-05, + "loss": 0.3073, + "step": 2162 + }, + { + "epoch": 2.399667175149078, + "grad_norm": 0.3038191497325897, + "learning_rate": 6.661184210526316e-05, + "loss": 0.3592, + "step": 2163 + }, + { + "epoch": 2.4007765913188184, + "grad_norm": 0.30106794834136963, + "learning_rate": 6.648848684210526e-05, + "loss": 0.3012, + "step": 2164 + }, + { + "epoch": 2.4018860074885593, + "grad_norm": 0.33144044876098633, + "learning_rate": 6.636513157894736e-05, + "loss": 0.4192, + "step": 2165 + }, + { + "epoch": 2.4029954236582998, + "grad_norm": 0.46323060989379883, + "learning_rate": 6.624177631578946e-05, + "loss": 0.2883, + "step": 2166 + }, + { + "epoch": 2.4041048398280407, + "grad_norm": 0.39412403106689453, + "learning_rate": 6.611842105263157e-05, + "loss": 0.3982, + "step": 2167 + }, + { + "epoch": 2.405214255997781, + "grad_norm": 0.4328696131706238, + "learning_rate": 6.599506578947368e-05, + "loss": 0.2954, + "step": 2168 + }, + { + "epoch": 2.406323672167522, + "grad_norm": 0.4066632390022278, + "learning_rate": 6.587171052631578e-05, + "loss": 0.5436, + "step": 2169 + }, + { + "epoch": 2.4074330883372626, + "grad_norm": 0.41099557280540466, + "learning_rate": 6.574835526315789e-05, + "loss": 0.3294, + "step": 2170 + }, + { + "epoch": 2.408542504507003, + "grad_norm": 0.44337305426597595, + "learning_rate": 6.5625e-05, + "loss": 0.4187, + "step": 2171 + }, + { + "epoch": 2.409651920676744, + "grad_norm": 0.43825507164001465, + "learning_rate": 6.55016447368421e-05, + "loss": 0.4014, + "step": 2172 + }, + { + "epoch": 2.4107613368464844, + "grad_norm": 0.37434348464012146, + "learning_rate": 6.537828947368421e-05, + "loss": 0.3387, + "step": 2173 + }, + { + "epoch": 2.4118707530162253, + "grad_norm": 0.49352914094924927, + "learning_rate": 6.52549342105263e-05, + "loss": 0.538, + "step": 2174 + }, + { + "epoch": 2.412980169185966, + "grad_norm": 0.3878787159919739, + "learning_rate": 6.513157894736842e-05, + "loss": 0.355, + "step": 2175 + }, + { + "epoch": 2.4140895853557067, + "grad_norm": 0.4009630084037781, + "learning_rate": 6.500822368421052e-05, + "loss": 0.2911, + "step": 2176 + }, + { + "epoch": 2.415199001525447, + "grad_norm": 0.5621581077575684, + "learning_rate": 6.488486842105264e-05, + "loss": 0.3631, + "step": 2177 + }, + { + "epoch": 2.4163084176951877, + "grad_norm": 0.5268844366073608, + "learning_rate": 6.476151315789473e-05, + "loss": 0.1947, + "step": 2178 + }, + { + "epoch": 2.4174178338649286, + "grad_norm": 0.4071340560913086, + "learning_rate": 6.463815789473684e-05, + "loss": 0.4349, + "step": 2179 + }, + { + "epoch": 2.418527250034669, + "grad_norm": 0.5059108734130859, + "learning_rate": 6.451480263157894e-05, + "loss": 0.4169, + "step": 2180 + }, + { + "epoch": 2.41963666620441, + "grad_norm": 0.3529709279537201, + "learning_rate": 6.439144736842105e-05, + "loss": 0.3216, + "step": 2181 + }, + { + "epoch": 2.4207460823741505, + "grad_norm": 0.3386879563331604, + "learning_rate": 6.426809210526316e-05, + "loss": 0.3775, + "step": 2182 + }, + { + "epoch": 2.4218554985438914, + "grad_norm": 0.4721252918243408, + "learning_rate": 6.414473684210526e-05, + "loss": 0.3298, + "step": 2183 + }, + { + "epoch": 2.422964914713632, + "grad_norm": 0.58592289686203, + "learning_rate": 6.402138157894736e-05, + "loss": 0.3353, + "step": 2184 + }, + { + "epoch": 2.4240743308833728, + "grad_norm": 0.3879697918891907, + "learning_rate": 6.389802631578946e-05, + "loss": 0.3383, + "step": 2185 + }, + { + "epoch": 2.4251837470531132, + "grad_norm": 0.5064356327056885, + "learning_rate": 6.377467105263157e-05, + "loss": 0.2966, + "step": 2186 + }, + { + "epoch": 2.426293163222854, + "grad_norm": 0.4402575194835663, + "learning_rate": 6.365131578947368e-05, + "loss": 0.3114, + "step": 2187 + }, + { + "epoch": 2.4274025793925946, + "grad_norm": 0.5321472883224487, + "learning_rate": 6.352796052631578e-05, + "loss": 0.4891, + "step": 2188 + }, + { + "epoch": 2.428511995562335, + "grad_norm": 0.4424992799758911, + "learning_rate": 6.340460526315789e-05, + "loss": 0.5393, + "step": 2189 + }, + { + "epoch": 2.429621411732076, + "grad_norm": 0.37487635016441345, + "learning_rate": 6.328125e-05, + "loss": 0.2881, + "step": 2190 + }, + { + "epoch": 2.4307308279018165, + "grad_norm": 0.4092381000518799, + "learning_rate": 6.315789473684209e-05, + "loss": 0.3569, + "step": 2191 + }, + { + "epoch": 2.4318402440715574, + "grad_norm": 0.43235230445861816, + "learning_rate": 6.303453947368421e-05, + "loss": 0.3249, + "step": 2192 + }, + { + "epoch": 2.432949660241298, + "grad_norm": 0.4262683689594269, + "learning_rate": 6.29111842105263e-05, + "loss": 0.3653, + "step": 2193 + }, + { + "epoch": 2.434059076411039, + "grad_norm": 0.46941083669662476, + "learning_rate": 6.278782894736842e-05, + "loss": 0.3763, + "step": 2194 + }, + { + "epoch": 2.4351684925807793, + "grad_norm": 0.44827941060066223, + "learning_rate": 6.266447368421052e-05, + "loss": 0.5289, + "step": 2195 + }, + { + "epoch": 2.43627790875052, + "grad_norm": 0.4619573652744293, + "learning_rate": 6.254111842105262e-05, + "loss": 0.3311, + "step": 2196 + }, + { + "epoch": 2.4373873249202607, + "grad_norm": 0.4534587562084198, + "learning_rate": 6.241776315789473e-05, + "loss": 0.5748, + "step": 2197 + }, + { + "epoch": 2.4384967410900016, + "grad_norm": 0.5071384906768799, + "learning_rate": 6.229440789473684e-05, + "loss": 0.436, + "step": 2198 + }, + { + "epoch": 2.439606157259742, + "grad_norm": 0.31029054522514343, + "learning_rate": 6.217105263157894e-05, + "loss": 0.4601, + "step": 2199 + }, + { + "epoch": 2.4407155734294825, + "grad_norm": 0.6883265972137451, + "learning_rate": 6.204769736842105e-05, + "loss": 0.7862, + "step": 2200 + }, + { + "epoch": 2.4418249895992234, + "grad_norm": 0.5015659332275391, + "learning_rate": 6.192434210526316e-05, + "loss": 0.589, + "step": 2201 + }, + { + "epoch": 2.442934405768964, + "grad_norm": 0.5587977170944214, + "learning_rate": 6.180098684210526e-05, + "loss": 0.5756, + "step": 2202 + }, + { + "epoch": 2.444043821938705, + "grad_norm": 0.3763371706008911, + "learning_rate": 6.167763157894736e-05, + "loss": 0.3825, + "step": 2203 + }, + { + "epoch": 2.4451532381084453, + "grad_norm": 0.39078426361083984, + "learning_rate": 6.155427631578946e-05, + "loss": 0.2897, + "step": 2204 + }, + { + "epoch": 2.4462626542781862, + "grad_norm": 0.2745046615600586, + "learning_rate": 6.143092105263157e-05, + "loss": 0.3853, + "step": 2205 + }, + { + "epoch": 2.4473720704479267, + "grad_norm": 0.3368263840675354, + "learning_rate": 6.130756578947368e-05, + "loss": 0.4171, + "step": 2206 + }, + { + "epoch": 2.4484814866176676, + "grad_norm": 0.3768693208694458, + "learning_rate": 6.118421052631578e-05, + "loss": 0.3516, + "step": 2207 + }, + { + "epoch": 2.449590902787408, + "grad_norm": 0.7670299410820007, + "learning_rate": 6.106085526315789e-05, + "loss": 0.4042, + "step": 2208 + }, + { + "epoch": 2.450700318957149, + "grad_norm": 0.332269549369812, + "learning_rate": 6.09375e-05, + "loss": 0.4697, + "step": 2209 + }, + { + "epoch": 2.4518097351268895, + "grad_norm": 0.3532228171825409, + "learning_rate": 6.08141447368421e-05, + "loss": 0.3612, + "step": 2210 + }, + { + "epoch": 2.45291915129663, + "grad_norm": 0.39647355675697327, + "learning_rate": 6.0690789473684204e-05, + "loss": 0.3681, + "step": 2211 + }, + { + "epoch": 2.454028567466371, + "grad_norm": 0.3135978579521179, + "learning_rate": 6.056743421052631e-05, + "loss": 0.3349, + "step": 2212 + }, + { + "epoch": 2.4551379836361114, + "grad_norm": 0.3503597378730774, + "learning_rate": 6.044407894736842e-05, + "loss": 0.4885, + "step": 2213 + }, + { + "epoch": 2.4562473998058523, + "grad_norm": 0.3332939147949219, + "learning_rate": 6.032072368421052e-05, + "loss": 0.494, + "step": 2214 + }, + { + "epoch": 2.4573568159755927, + "grad_norm": 0.24943743646144867, + "learning_rate": 6.019736842105263e-05, + "loss": 0.2715, + "step": 2215 + }, + { + "epoch": 2.4584662321453337, + "grad_norm": 0.3770546615123749, + "learning_rate": 6.007401315789473e-05, + "loss": 0.4246, + "step": 2216 + }, + { + "epoch": 2.459575648315074, + "grad_norm": 0.474202960729599, + "learning_rate": 5.995065789473683e-05, + "loss": 0.4433, + "step": 2217 + }, + { + "epoch": 2.460685064484815, + "grad_norm": 0.5108838081359863, + "learning_rate": 5.9827302631578944e-05, + "loss": 0.3707, + "step": 2218 + }, + { + "epoch": 2.4617944806545555, + "grad_norm": 0.3123144507408142, + "learning_rate": 5.9703947368421044e-05, + "loss": 0.5036, + "step": 2219 + }, + { + "epoch": 2.4629038968242964, + "grad_norm": 0.5090911984443665, + "learning_rate": 5.958059210526316e-05, + "loss": 0.2868, + "step": 2220 + }, + { + "epoch": 2.464013312994037, + "grad_norm": 0.300322562456131, + "learning_rate": 5.945723684210526e-05, + "loss": 0.4943, + "step": 2221 + }, + { + "epoch": 2.4651227291637774, + "grad_norm": 0.5102697014808655, + "learning_rate": 5.933388157894736e-05, + "loss": 0.2345, + "step": 2222 + }, + { + "epoch": 2.4662321453335183, + "grad_norm": 0.28978249430656433, + "learning_rate": 5.921052631578947e-05, + "loss": 0.4279, + "step": 2223 + }, + { + "epoch": 2.467341561503259, + "grad_norm": 0.29028403759002686, + "learning_rate": 5.908717105263157e-05, + "loss": 0.4429, + "step": 2224 + }, + { + "epoch": 2.4684509776729997, + "grad_norm": 0.2963179349899292, + "learning_rate": 5.8963815789473684e-05, + "loss": 0.2816, + "step": 2225 + }, + { + "epoch": 2.46956039384274, + "grad_norm": 0.45071807503700256, + "learning_rate": 5.8840460526315784e-05, + "loss": 0.4315, + "step": 2226 + }, + { + "epoch": 2.470669810012481, + "grad_norm": 0.5235294699668884, + "learning_rate": 5.871710526315789e-05, + "loss": 0.4252, + "step": 2227 + }, + { + "epoch": 2.4717792261822216, + "grad_norm": 0.3098270893096924, + "learning_rate": 5.859375e-05, + "loss": 0.5957, + "step": 2228 + }, + { + "epoch": 2.472888642351962, + "grad_norm": 0.4504237473011017, + "learning_rate": 5.84703947368421e-05, + "loss": 0.4309, + "step": 2229 + }, + { + "epoch": 2.473998058521703, + "grad_norm": 0.3145286738872528, + "learning_rate": 5.8347039473684205e-05, + "loss": 0.4284, + "step": 2230 + }, + { + "epoch": 2.475107474691444, + "grad_norm": 0.39920830726623535, + "learning_rate": 5.822368421052631e-05, + "loss": 0.3452, + "step": 2231 + }, + { + "epoch": 2.4762168908611843, + "grad_norm": 0.5176841020584106, + "learning_rate": 5.810032894736842e-05, + "loss": 0.3673, + "step": 2232 + }, + { + "epoch": 2.477326307030925, + "grad_norm": 0.3197839558124542, + "learning_rate": 5.797697368421052e-05, + "loss": 0.3608, + "step": 2233 + }, + { + "epoch": 2.4784357232006657, + "grad_norm": 0.46264639496803284, + "learning_rate": 5.785361842105263e-05, + "loss": 0.3975, + "step": 2234 + }, + { + "epoch": 2.479545139370406, + "grad_norm": 0.6301188468933105, + "learning_rate": 5.773026315789473e-05, + "loss": 0.5299, + "step": 2235 + }, + { + "epoch": 2.480654555540147, + "grad_norm": 0.34785377979278564, + "learning_rate": 5.760690789473683e-05, + "loss": 0.3115, + "step": 2236 + }, + { + "epoch": 2.4817639717098876, + "grad_norm": 0.658812403678894, + "learning_rate": 5.7483552631578945e-05, + "loss": 0.363, + "step": 2237 + }, + { + "epoch": 2.4828733878796285, + "grad_norm": 0.5214020013809204, + "learning_rate": 5.7360197368421045e-05, + "loss": 0.4809, + "step": 2238 + }, + { + "epoch": 2.483982804049369, + "grad_norm": 0.5607793927192688, + "learning_rate": 5.723684210526316e-05, + "loss": 0.2884, + "step": 2239 + }, + { + "epoch": 2.4850922202191095, + "grad_norm": 0.39174753427505493, + "learning_rate": 5.711348684210526e-05, + "loss": 0.353, + "step": 2240 + }, + { + "epoch": 2.4862016363888504, + "grad_norm": 0.3475854694843292, + "learning_rate": 5.699013157894736e-05, + "loss": 0.4008, + "step": 2241 + }, + { + "epoch": 2.487311052558591, + "grad_norm": 0.5239514708518982, + "learning_rate": 5.686677631578947e-05, + "loss": 0.4618, + "step": 2242 + }, + { + "epoch": 2.488420468728332, + "grad_norm": 0.358995646238327, + "learning_rate": 5.674342105263157e-05, + "loss": 0.4906, + "step": 2243 + }, + { + "epoch": 2.4895298848980723, + "grad_norm": 0.3828662633895874, + "learning_rate": 5.662006578947368e-05, + "loss": 0.2941, + "step": 2244 + }, + { + "epoch": 2.490639301067813, + "grad_norm": 0.43704545497894287, + "learning_rate": 5.6496710526315785e-05, + "loss": 0.3913, + "step": 2245 + }, + { + "epoch": 2.4917487172375536, + "grad_norm": 0.29927560687065125, + "learning_rate": 5.637335526315789e-05, + "loss": 0.2661, + "step": 2246 + }, + { + "epoch": 2.4928581334072946, + "grad_norm": 0.36267396807670593, + "learning_rate": 5.625e-05, + "loss": 0.2605, + "step": 2247 + }, + { + "epoch": 2.493967549577035, + "grad_norm": 0.34938421845436096, + "learning_rate": 5.61266447368421e-05, + "loss": 0.3623, + "step": 2248 + }, + { + "epoch": 2.495076965746776, + "grad_norm": 0.4633193016052246, + "learning_rate": 5.6003289473684205e-05, + "loss": 0.3735, + "step": 2249 + }, + { + "epoch": 2.4961863819165164, + "grad_norm": 0.3852117359638214, + "learning_rate": 5.587993421052631e-05, + "loss": 0.5006, + "step": 2250 + }, + { + "epoch": 2.497295798086257, + "grad_norm": 0.528650164604187, + "learning_rate": 5.575657894736842e-05, + "loss": 0.4337, + "step": 2251 + }, + { + "epoch": 2.498405214255998, + "grad_norm": 0.3810504972934723, + "learning_rate": 5.563322368421052e-05, + "loss": 0.3518, + "step": 2252 + }, + { + "epoch": 2.4995146304257383, + "grad_norm": 0.35557541251182556, + "learning_rate": 5.550986842105263e-05, + "loss": 0.3932, + "step": 2253 + }, + { + "epoch": 2.500624046595479, + "grad_norm": 0.2806094288825989, + "learning_rate": 5.538651315789473e-05, + "loss": 0.2386, + "step": 2254 + }, + { + "epoch": 2.5017334627652197, + "grad_norm": 0.42694249749183655, + "learning_rate": 5.526315789473683e-05, + "loss": 0.4441, + "step": 2255 + }, + { + "epoch": 2.5028428789349606, + "grad_norm": 0.32019782066345215, + "learning_rate": 5.5139802631578945e-05, + "loss": 0.3024, + "step": 2256 + }, + { + "epoch": 2.503952295104701, + "grad_norm": 0.43957844376564026, + "learning_rate": 5.5016447368421045e-05, + "loss": 0.3916, + "step": 2257 + }, + { + "epoch": 2.5050617112744415, + "grad_norm": 0.39406758546829224, + "learning_rate": 5.489309210526316e-05, + "loss": 0.3808, + "step": 2258 + }, + { + "epoch": 2.5061711274441825, + "grad_norm": 0.36642491817474365, + "learning_rate": 5.476973684210526e-05, + "loss": 0.4663, + "step": 2259 + }, + { + "epoch": 2.5072805436139234, + "grad_norm": 0.3601360619068146, + "learning_rate": 5.464638157894736e-05, + "loss": 0.4061, + "step": 2260 + }, + { + "epoch": 2.508389959783664, + "grad_norm": 0.406791627407074, + "learning_rate": 5.452302631578947e-05, + "loss": 0.3205, + "step": 2261 + }, + { + "epoch": 2.5094993759534043, + "grad_norm": 0.3350258469581604, + "learning_rate": 5.439967105263157e-05, + "loss": 0.2646, + "step": 2262 + }, + { + "epoch": 2.5106087921231452, + "grad_norm": 0.48063212633132935, + "learning_rate": 5.427631578947368e-05, + "loss": 0.4569, + "step": 2263 + }, + { + "epoch": 2.5117182082928857, + "grad_norm": 0.4414843022823334, + "learning_rate": 5.4152960526315786e-05, + "loss": 0.3192, + "step": 2264 + }, + { + "epoch": 2.5128276244626266, + "grad_norm": 0.4843035042285919, + "learning_rate": 5.402960526315789e-05, + "loss": 0.3547, + "step": 2265 + }, + { + "epoch": 2.513937040632367, + "grad_norm": 0.30912336707115173, + "learning_rate": 5.390624999999999e-05, + "loss": 0.4092, + "step": 2266 + }, + { + "epoch": 2.515046456802108, + "grad_norm": 0.3699786365032196, + "learning_rate": 5.37828947368421e-05, + "loss": 0.3192, + "step": 2267 + }, + { + "epoch": 2.5161558729718485, + "grad_norm": 0.6730918884277344, + "learning_rate": 5.3659539473684206e-05, + "loss": 0.4083, + "step": 2268 + }, + { + "epoch": 2.517265289141589, + "grad_norm": 0.44793230295181274, + "learning_rate": 5.3536184210526306e-05, + "loss": 0.3056, + "step": 2269 + }, + { + "epoch": 2.51837470531133, + "grad_norm": 0.48553967475891113, + "learning_rate": 5.341282894736842e-05, + "loss": 0.2853, + "step": 2270 + }, + { + "epoch": 2.519484121481071, + "grad_norm": 0.3934726417064667, + "learning_rate": 5.328947368421052e-05, + "loss": 0.252, + "step": 2271 + }, + { + "epoch": 2.5205935376508113, + "grad_norm": 0.44591614603996277, + "learning_rate": 5.316611842105263e-05, + "loss": 0.5068, + "step": 2272 + }, + { + "epoch": 2.5217029538205518, + "grad_norm": 0.40541309118270874, + "learning_rate": 5.304276315789473e-05, + "loss": 0.427, + "step": 2273 + }, + { + "epoch": 2.5228123699902927, + "grad_norm": 0.4913922846317291, + "learning_rate": 5.291940789473683e-05, + "loss": 0.3514, + "step": 2274 + }, + { + "epoch": 2.523921786160033, + "grad_norm": 0.3643013834953308, + "learning_rate": 5.2796052631578946e-05, + "loss": 0.2301, + "step": 2275 + }, + { + "epoch": 2.525031202329774, + "grad_norm": 0.389931321144104, + "learning_rate": 5.2672697368421046e-05, + "loss": 0.5792, + "step": 2276 + }, + { + "epoch": 2.5261406184995145, + "grad_norm": 0.353927344083786, + "learning_rate": 5.254934210526316e-05, + "loss": 0.2489, + "step": 2277 + }, + { + "epoch": 2.5272500346692555, + "grad_norm": 0.6424699425697327, + "learning_rate": 5.242598684210526e-05, + "loss": 0.4858, + "step": 2278 + }, + { + "epoch": 2.528359450838996, + "grad_norm": 0.35985687375068665, + "learning_rate": 5.230263157894736e-05, + "loss": 0.3414, + "step": 2279 + }, + { + "epoch": 2.5294688670087364, + "grad_norm": 0.4226178228855133, + "learning_rate": 5.217927631578947e-05, + "loss": 0.4188, + "step": 2280 + }, + { + "epoch": 2.5305782831784773, + "grad_norm": 0.3257390558719635, + "learning_rate": 5.205592105263157e-05, + "loss": 0.4046, + "step": 2281 + }, + { + "epoch": 2.5316876993482182, + "grad_norm": 0.31944599747657776, + "learning_rate": 5.193256578947368e-05, + "loss": 0.3429, + "step": 2282 + }, + { + "epoch": 2.5327971155179587, + "grad_norm": 0.3145938515663147, + "learning_rate": 5.1809210526315786e-05, + "loss": 0.4005, + "step": 2283 + }, + { + "epoch": 2.533906531687699, + "grad_norm": 0.4402817189693451, + "learning_rate": 5.168585526315789e-05, + "loss": 0.4823, + "step": 2284 + }, + { + "epoch": 2.53501594785744, + "grad_norm": 0.43508538603782654, + "learning_rate": 5.156249999999999e-05, + "loss": 0.3428, + "step": 2285 + }, + { + "epoch": 2.5361253640271806, + "grad_norm": 0.4584806561470032, + "learning_rate": 5.14391447368421e-05, + "loss": 0.396, + "step": 2286 + }, + { + "epoch": 2.5372347801969215, + "grad_norm": 0.454815149307251, + "learning_rate": 5.1315789473684206e-05, + "loss": 0.4953, + "step": 2287 + }, + { + "epoch": 2.538344196366662, + "grad_norm": 0.35620346665382385, + "learning_rate": 5.1192434210526306e-05, + "loss": 0.4026, + "step": 2288 + }, + { + "epoch": 2.539453612536403, + "grad_norm": 0.34608957171440125, + "learning_rate": 5.106907894736842e-05, + "loss": 0.5012, + "step": 2289 + }, + { + "epoch": 2.5405630287061434, + "grad_norm": 0.3685770630836487, + "learning_rate": 5.094572368421052e-05, + "loss": 0.4459, + "step": 2290 + }, + { + "epoch": 2.541672444875884, + "grad_norm": 0.34064555168151855, + "learning_rate": 5.082236842105263e-05, + "loss": 0.206, + "step": 2291 + }, + { + "epoch": 2.5427818610456248, + "grad_norm": 0.5084649324417114, + "learning_rate": 5.069901315789473e-05, + "loss": 0.4602, + "step": 2292 + }, + { + "epoch": 2.5438912772153657, + "grad_norm": 0.4061320126056671, + "learning_rate": 5.057565789473683e-05, + "loss": 0.5329, + "step": 2293 + }, + { + "epoch": 2.545000693385106, + "grad_norm": 0.2851752042770386, + "learning_rate": 5.045230263157895e-05, + "loss": 0.2781, + "step": 2294 + }, + { + "epoch": 2.5461101095548466, + "grad_norm": 0.2984931170940399, + "learning_rate": 5.032894736842105e-05, + "loss": 0.3925, + "step": 2295 + }, + { + "epoch": 2.5472195257245875, + "grad_norm": 0.3785136342048645, + "learning_rate": 5.020559210526315e-05, + "loss": 0.2643, + "step": 2296 + }, + { + "epoch": 2.548328941894328, + "grad_norm": 0.4343215823173523, + "learning_rate": 5.008223684210526e-05, + "loss": 0.4111, + "step": 2297 + }, + { + "epoch": 2.549438358064069, + "grad_norm": 0.3174603581428528, + "learning_rate": 4.995888157894736e-05, + "loss": 0.3798, + "step": 2298 + }, + { + "epoch": 2.5505477742338094, + "grad_norm": 0.30842000246047974, + "learning_rate": 4.9835526315789474e-05, + "loss": 0.2997, + "step": 2299 + }, + { + "epoch": 2.5516571904035503, + "grad_norm": 0.42980441451072693, + "learning_rate": 4.9712171052631573e-05, + "loss": 0.3467, + "step": 2300 + }, + { + "epoch": 2.552766606573291, + "grad_norm": 0.3155680000782013, + "learning_rate": 4.958881578947368e-05, + "loss": 0.3337, + "step": 2301 + }, + { + "epoch": 2.5538760227430313, + "grad_norm": 0.38970938324928284, + "learning_rate": 4.946546052631579e-05, + "loss": 0.327, + "step": 2302 + }, + { + "epoch": 2.554985438912772, + "grad_norm": 0.43559134006500244, + "learning_rate": 4.9342105263157894e-05, + "loss": 0.5547, + "step": 2303 + }, + { + "epoch": 2.5560948550825127, + "grad_norm": 0.4793894290924072, + "learning_rate": 4.9218749999999994e-05, + "loss": 0.4121, + "step": 2304 + }, + { + "epoch": 2.5572042712522536, + "grad_norm": 0.5885838270187378, + "learning_rate": 4.90953947368421e-05, + "loss": 0.5028, + "step": 2305 + }, + { + "epoch": 2.558313687421994, + "grad_norm": 0.4064948260784149, + "learning_rate": 4.897203947368421e-05, + "loss": 0.257, + "step": 2306 + }, + { + "epoch": 2.559423103591735, + "grad_norm": 0.6676486730575562, + "learning_rate": 4.884868421052631e-05, + "loss": 0.6808, + "step": 2307 + }, + { + "epoch": 2.5605325197614754, + "grad_norm": 0.3958938419818878, + "learning_rate": 4.872532894736842e-05, + "loss": 0.4467, + "step": 2308 + }, + { + "epoch": 2.5616419359312164, + "grad_norm": 0.5410012602806091, + "learning_rate": 4.860197368421052e-05, + "loss": 0.4545, + "step": 2309 + }, + { + "epoch": 2.562751352100957, + "grad_norm": 0.31039872765541077, + "learning_rate": 4.8478618421052634e-05, + "loss": 0.4389, + "step": 2310 + }, + { + "epoch": 2.5638607682706978, + "grad_norm": 0.4607661962509155, + "learning_rate": 4.8355263157894734e-05, + "loss": 0.3213, + "step": 2311 + }, + { + "epoch": 2.5649701844404382, + "grad_norm": 0.33510398864746094, + "learning_rate": 4.8231907894736834e-05, + "loss": 0.3981, + "step": 2312 + }, + { + "epoch": 2.5660796006101787, + "grad_norm": 0.42955508828163147, + "learning_rate": 4.810855263157895e-05, + "loss": 0.2282, + "step": 2313 + }, + { + "epoch": 2.5671890167799196, + "grad_norm": 0.5348376035690308, + "learning_rate": 4.798519736842105e-05, + "loss": 0.4386, + "step": 2314 + }, + { + "epoch": 2.56829843294966, + "grad_norm": 0.4563250243663788, + "learning_rate": 4.7861842105263154e-05, + "loss": 0.3593, + "step": 2315 + }, + { + "epoch": 2.569407849119401, + "grad_norm": 0.38774073123931885, + "learning_rate": 4.773848684210526e-05, + "loss": 0.3159, + "step": 2316 + }, + { + "epoch": 2.5705172652891415, + "grad_norm": 0.33762556314468384, + "learning_rate": 4.761513157894736e-05, + "loss": 0.2853, + "step": 2317 + }, + { + "epoch": 2.5716266814588824, + "grad_norm": 0.5899233222007751, + "learning_rate": 4.749177631578947e-05, + "loss": 0.2734, + "step": 2318 + }, + { + "epoch": 2.572736097628623, + "grad_norm": 0.4161059558391571, + "learning_rate": 4.7368421052631574e-05, + "loss": 0.442, + "step": 2319 + }, + { + "epoch": 2.5738455137983633, + "grad_norm": 0.366187185049057, + "learning_rate": 4.724506578947368e-05, + "loss": 0.2767, + "step": 2320 + }, + { + "epoch": 2.5749549299681043, + "grad_norm": 0.325467586517334, + "learning_rate": 4.712171052631578e-05, + "loss": 0.4227, + "step": 2321 + }, + { + "epoch": 2.576064346137845, + "grad_norm": 0.31031447649002075, + "learning_rate": 4.6998355263157894e-05, + "loss": 0.3563, + "step": 2322 + }, + { + "epoch": 2.5771737623075857, + "grad_norm": 0.4601028263568878, + "learning_rate": 4.6874999999999994e-05, + "loss": 0.3945, + "step": 2323 + }, + { + "epoch": 2.578283178477326, + "grad_norm": 0.5773911476135254, + "learning_rate": 4.67516447368421e-05, + "loss": 0.5002, + "step": 2324 + }, + { + "epoch": 2.579392594647067, + "grad_norm": 0.3071858584880829, + "learning_rate": 4.662828947368421e-05, + "loss": 0.3305, + "step": 2325 + }, + { + "epoch": 2.5805020108168075, + "grad_norm": 0.42758241295814514, + "learning_rate": 4.650493421052631e-05, + "loss": 0.3915, + "step": 2326 + }, + { + "epoch": 2.5816114269865484, + "grad_norm": 0.4389335811138153, + "learning_rate": 4.638157894736842e-05, + "loss": 0.41, + "step": 2327 + }, + { + "epoch": 2.582720843156289, + "grad_norm": 0.4536753296852112, + "learning_rate": 4.625822368421052e-05, + "loss": 0.552, + "step": 2328 + }, + { + "epoch": 2.58383025932603, + "grad_norm": 0.4767647981643677, + "learning_rate": 4.6134868421052635e-05, + "loss": 0.2961, + "step": 2329 + }, + { + "epoch": 2.5849396754957703, + "grad_norm": 0.33526867628097534, + "learning_rate": 4.6011513157894734e-05, + "loss": 0.4272, + "step": 2330 + }, + { + "epoch": 2.586049091665511, + "grad_norm": 0.3391248285770416, + "learning_rate": 4.5888157894736834e-05, + "loss": 0.4728, + "step": 2331 + }, + { + "epoch": 2.5871585078352517, + "grad_norm": 0.44053715467453003, + "learning_rate": 4.576480263157895e-05, + "loss": 0.4216, + "step": 2332 + }, + { + "epoch": 2.5882679240049926, + "grad_norm": 0.3894062340259552, + "learning_rate": 4.564144736842105e-05, + "loss": 0.163, + "step": 2333 + }, + { + "epoch": 2.589377340174733, + "grad_norm": 0.37204012274742126, + "learning_rate": 4.5518092105263155e-05, + "loss": 0.388, + "step": 2334 + }, + { + "epoch": 2.5904867563444736, + "grad_norm": 0.3864336907863617, + "learning_rate": 4.539473684210526e-05, + "loss": 0.2925, + "step": 2335 + }, + { + "epoch": 2.5915961725142145, + "grad_norm": 0.2916948199272156, + "learning_rate": 4.527138157894736e-05, + "loss": 0.2073, + "step": 2336 + }, + { + "epoch": 2.592705588683955, + "grad_norm": 0.8837294578552246, + "learning_rate": 4.514802631578947e-05, + "loss": 0.2756, + "step": 2337 + }, + { + "epoch": 2.593815004853696, + "grad_norm": 0.41746076941490173, + "learning_rate": 4.5024671052631575e-05, + "loss": 0.4903, + "step": 2338 + }, + { + "epoch": 2.5949244210234363, + "grad_norm": 0.31884750723838806, + "learning_rate": 4.490131578947368e-05, + "loss": 0.4432, + "step": 2339 + }, + { + "epoch": 2.5960338371931773, + "grad_norm": 0.30209752917289734, + "learning_rate": 4.477796052631578e-05, + "loss": 0.3105, + "step": 2340 + }, + { + "epoch": 2.5971432533629177, + "grad_norm": 0.42073366045951843, + "learning_rate": 4.4654605263157895e-05, + "loss": 0.2862, + "step": 2341 + }, + { + "epoch": 2.598252669532658, + "grad_norm": 1.1312992572784424, + "learning_rate": 4.4531249999999995e-05, + "loss": 0.3452, + "step": 2342 + }, + { + "epoch": 2.599362085702399, + "grad_norm": 0.35556334257125854, + "learning_rate": 4.4407894736842095e-05, + "loss": 0.3874, + "step": 2343 + }, + { + "epoch": 2.60047150187214, + "grad_norm": 0.43012315034866333, + "learning_rate": 4.428453947368421e-05, + "loss": 0.3205, + "step": 2344 + }, + { + "epoch": 2.6015809180418805, + "grad_norm": 0.3909721076488495, + "learning_rate": 4.416118421052631e-05, + "loss": 0.3442, + "step": 2345 + }, + { + "epoch": 2.602690334211621, + "grad_norm": 0.4908634424209595, + "learning_rate": 4.403782894736842e-05, + "loss": 0.5702, + "step": 2346 + }, + { + "epoch": 2.603799750381362, + "grad_norm": 0.6141018867492676, + "learning_rate": 4.391447368421052e-05, + "loss": 0.4149, + "step": 2347 + }, + { + "epoch": 2.6049091665511024, + "grad_norm": 0.4573987126350403, + "learning_rate": 4.379111842105263e-05, + "loss": 0.4007, + "step": 2348 + }, + { + "epoch": 2.6060185827208433, + "grad_norm": 0.4271261692047119, + "learning_rate": 4.3667763157894735e-05, + "loss": 0.4075, + "step": 2349 + }, + { + "epoch": 2.6071279988905838, + "grad_norm": 0.39978379011154175, + "learning_rate": 4.3544407894736835e-05, + "loss": 0.4654, + "step": 2350 + }, + { + "epoch": 2.6082374150603247, + "grad_norm": 0.33403047919273376, + "learning_rate": 4.342105263157895e-05, + "loss": 0.4196, + "step": 2351 + }, + { + "epoch": 2.609346831230065, + "grad_norm": 0.5256022214889526, + "learning_rate": 4.329769736842105e-05, + "loss": 0.2752, + "step": 2352 + }, + { + "epoch": 2.6104562473998056, + "grad_norm": 0.4153258204460144, + "learning_rate": 4.3174342105263155e-05, + "loss": 0.3468, + "step": 2353 + }, + { + "epoch": 2.6115656635695466, + "grad_norm": 0.6061992645263672, + "learning_rate": 4.305098684210526e-05, + "loss": 0.351, + "step": 2354 + }, + { + "epoch": 2.6126750797392875, + "grad_norm": 0.295296847820282, + "learning_rate": 4.292763157894736e-05, + "loss": 0.2476, + "step": 2355 + }, + { + "epoch": 2.613784495909028, + "grad_norm": 0.3813928961753845, + "learning_rate": 4.280427631578947e-05, + "loss": 0.3138, + "step": 2356 + }, + { + "epoch": 2.6148939120787684, + "grad_norm": 0.3294810652732849, + "learning_rate": 4.2680921052631575e-05, + "loss": 0.3273, + "step": 2357 + }, + { + "epoch": 2.6160033282485093, + "grad_norm": 0.4694172739982605, + "learning_rate": 4.255756578947368e-05, + "loss": 0.3346, + "step": 2358 + }, + { + "epoch": 2.61711274441825, + "grad_norm": 0.5706562399864197, + "learning_rate": 4.243421052631578e-05, + "loss": 0.2241, + "step": 2359 + }, + { + "epoch": 2.6182221605879907, + "grad_norm": 0.6200342774391174, + "learning_rate": 4.2310855263157896e-05, + "loss": 0.4129, + "step": 2360 + }, + { + "epoch": 2.619331576757731, + "grad_norm": 0.4055767059326172, + "learning_rate": 4.2187499999999995e-05, + "loss": 0.4006, + "step": 2361 + }, + { + "epoch": 2.620440992927472, + "grad_norm": 0.33380985260009766, + "learning_rate": 4.2064144736842095e-05, + "loss": 0.3732, + "step": 2362 + }, + { + "epoch": 2.6215504090972126, + "grad_norm": 0.41517943143844604, + "learning_rate": 4.194078947368421e-05, + "loss": 0.2458, + "step": 2363 + }, + { + "epoch": 2.622659825266953, + "grad_norm": 0.3231419324874878, + "learning_rate": 4.181743421052631e-05, + "loss": 0.5769, + "step": 2364 + }, + { + "epoch": 2.623769241436694, + "grad_norm": 0.4171946942806244, + "learning_rate": 4.169407894736842e-05, + "loss": 0.4702, + "step": 2365 + }, + { + "epoch": 2.6248786576064345, + "grad_norm": 0.3811667263507843, + "learning_rate": 4.157072368421052e-05, + "loss": 0.3849, + "step": 2366 + }, + { + "epoch": 2.6259880737761754, + "grad_norm": 0.41539669036865234, + "learning_rate": 4.144736842105263e-05, + "loss": 0.3845, + "step": 2367 + }, + { + "epoch": 2.627097489945916, + "grad_norm": 0.39716070890426636, + "learning_rate": 4.1324013157894736e-05, + "loss": 0.3744, + "step": 2368 + }, + { + "epoch": 2.6282069061156568, + "grad_norm": 0.38069993257522583, + "learning_rate": 4.1200657894736836e-05, + "loss": 0.2432, + "step": 2369 + }, + { + "epoch": 2.6293163222853972, + "grad_norm": 0.3581462502479553, + "learning_rate": 4.107730263157894e-05, + "loss": 0.6516, + "step": 2370 + }, + { + "epoch": 2.6304257384551377, + "grad_norm": 0.34660494327545166, + "learning_rate": 4.095394736842105e-05, + "loss": 0.3599, + "step": 2371 + }, + { + "epoch": 2.6315351546248786, + "grad_norm": 0.34078100323677063, + "learning_rate": 4.0830592105263156e-05, + "loss": 0.3611, + "step": 2372 + }, + { + "epoch": 2.6326445707946196, + "grad_norm": 0.3904211223125458, + "learning_rate": 4.0707236842105256e-05, + "loss": 0.301, + "step": 2373 + }, + { + "epoch": 2.63375398696436, + "grad_norm": 0.3696337342262268, + "learning_rate": 4.058388157894736e-05, + "loss": 0.3051, + "step": 2374 + }, + { + "epoch": 2.6348634031341005, + "grad_norm": 0.43636754155158997, + "learning_rate": 4.046052631578947e-05, + "loss": 0.4971, + "step": 2375 + }, + { + "epoch": 2.6359728193038414, + "grad_norm": 0.5181596279144287, + "learning_rate": 4.0337171052631576e-05, + "loss": 0.3127, + "step": 2376 + }, + { + "epoch": 2.637082235473582, + "grad_norm": 0.5476127862930298, + "learning_rate": 4.021381578947368e-05, + "loss": 0.5049, + "step": 2377 + }, + { + "epoch": 2.638191651643323, + "grad_norm": 0.5447264909744263, + "learning_rate": 4.009046052631578e-05, + "loss": 0.5255, + "step": 2378 + }, + { + "epoch": 2.6393010678130633, + "grad_norm": 0.5430484414100647, + "learning_rate": 3.9967105263157896e-05, + "loss": 0.3371, + "step": 2379 + }, + { + "epoch": 2.640410483982804, + "grad_norm": 0.4831867516040802, + "learning_rate": 3.9843749999999996e-05, + "loss": 0.4452, + "step": 2380 + }, + { + "epoch": 2.6415199001525447, + "grad_norm": 0.6055355072021484, + "learning_rate": 3.9720394736842096e-05, + "loss": 0.5317, + "step": 2381 + }, + { + "epoch": 2.642629316322285, + "grad_norm": 0.4254622757434845, + "learning_rate": 3.959703947368421e-05, + "loss": 0.3434, + "step": 2382 + }, + { + "epoch": 2.643738732492026, + "grad_norm": 0.4942837655544281, + "learning_rate": 3.947368421052631e-05, + "loss": 0.4468, + "step": 2383 + }, + { + "epoch": 2.644848148661767, + "grad_norm": 0.3226402997970581, + "learning_rate": 3.935032894736842e-05, + "loss": 0.4929, + "step": 2384 + }, + { + "epoch": 2.6459575648315075, + "grad_norm": 0.4470663368701935, + "learning_rate": 3.922697368421052e-05, + "loss": 0.33, + "step": 2385 + }, + { + "epoch": 2.647066981001248, + "grad_norm": 0.5086075663566589, + "learning_rate": 3.910361842105263e-05, + "loss": 0.6575, + "step": 2386 + }, + { + "epoch": 2.648176397170989, + "grad_norm": 0.39487743377685547, + "learning_rate": 3.8980263157894736e-05, + "loss": 0.3375, + "step": 2387 + }, + { + "epoch": 2.6492858133407293, + "grad_norm": 0.6295050978660583, + "learning_rate": 3.8856907894736836e-05, + "loss": 0.398, + "step": 2388 + }, + { + "epoch": 2.6503952295104702, + "grad_norm": 0.5763193368911743, + "learning_rate": 3.873355263157894e-05, + "loss": 0.3891, + "step": 2389 + }, + { + "epoch": 2.6515046456802107, + "grad_norm": 0.33462652564048767, + "learning_rate": 3.861019736842105e-05, + "loss": 0.428, + "step": 2390 + }, + { + "epoch": 2.6526140618499516, + "grad_norm": 0.6703096628189087, + "learning_rate": 3.8486842105263156e-05, + "loss": 0.3148, + "step": 2391 + }, + { + "epoch": 2.653723478019692, + "grad_norm": 0.3808022141456604, + "learning_rate": 3.8363486842105256e-05, + "loss": 0.4022, + "step": 2392 + }, + { + "epoch": 2.6548328941894326, + "grad_norm": 0.42382562160491943, + "learning_rate": 3.824013157894736e-05, + "loss": 0.2387, + "step": 2393 + }, + { + "epoch": 2.6559423103591735, + "grad_norm": 0.4965885579586029, + "learning_rate": 3.811677631578947e-05, + "loss": 0.5139, + "step": 2394 + }, + { + "epoch": 2.6570517265289144, + "grad_norm": 0.4375743269920349, + "learning_rate": 3.799342105263157e-05, + "loss": 0.3922, + "step": 2395 + }, + { + "epoch": 2.658161142698655, + "grad_norm": 0.44008010625839233, + "learning_rate": 3.787006578947368e-05, + "loss": 0.8074, + "step": 2396 + }, + { + "epoch": 2.6592705588683954, + "grad_norm": 0.40815237164497375, + "learning_rate": 3.774671052631578e-05, + "loss": 0.4673, + "step": 2397 + }, + { + "epoch": 2.6603799750381363, + "grad_norm": 0.3250466287136078, + "learning_rate": 3.76233552631579e-05, + "loss": 0.3603, + "step": 2398 + }, + { + "epoch": 2.6614893912078768, + "grad_norm": 0.4003657400608063, + "learning_rate": 3.75e-05, + "loss": 0.5028, + "step": 2399 + }, + { + "epoch": 2.6625988073776177, + "grad_norm": 0.475201278924942, + "learning_rate": 3.7376644736842103e-05, + "loss": 0.2309, + "step": 2400 + }, + { + "epoch": 2.663708223547358, + "grad_norm": 0.4348124563694, + "learning_rate": 3.725328947368421e-05, + "loss": 0.3997, + "step": 2401 + }, + { + "epoch": 2.664817639717099, + "grad_norm": 0.5189718008041382, + "learning_rate": 3.712993421052632e-05, + "loss": 0.4532, + "step": 2402 + }, + { + "epoch": 2.6659270558868395, + "grad_norm": 0.293647438287735, + "learning_rate": 3.700657894736842e-05, + "loss": 0.4344, + "step": 2403 + }, + { + "epoch": 2.66703647205658, + "grad_norm": 0.4177672564983368, + "learning_rate": 3.6883223684210524e-05, + "loss": 0.4263, + "step": 2404 + }, + { + "epoch": 2.668145888226321, + "grad_norm": 0.4328800141811371, + "learning_rate": 3.675986842105263e-05, + "loss": 0.3869, + "step": 2405 + }, + { + "epoch": 2.669255304396062, + "grad_norm": 0.3846050202846527, + "learning_rate": 3.663651315789474e-05, + "loss": 0.4011, + "step": 2406 + }, + { + "epoch": 2.6703647205658023, + "grad_norm": 0.3619375228881836, + "learning_rate": 3.651315789473684e-05, + "loss": 0.2928, + "step": 2407 + }, + { + "epoch": 2.671474136735543, + "grad_norm": 0.5226150155067444, + "learning_rate": 3.6389802631578944e-05, + "loss": 0.2546, + "step": 2408 + }, + { + "epoch": 2.6725835529052837, + "grad_norm": 0.3613680303096771, + "learning_rate": 3.626644736842105e-05, + "loss": 0.3706, + "step": 2409 + }, + { + "epoch": 2.673692969075024, + "grad_norm": 0.38295650482177734, + "learning_rate": 3.614309210526315e-05, + "loss": 0.3319, + "step": 2410 + }, + { + "epoch": 2.674802385244765, + "grad_norm": 0.3644557595252991, + "learning_rate": 3.601973684210526e-05, + "loss": 0.3344, + "step": 2411 + }, + { + "epoch": 2.6759118014145056, + "grad_norm": 0.4107860028743744, + "learning_rate": 3.5896381578947364e-05, + "loss": 0.2881, + "step": 2412 + }, + { + "epoch": 2.6770212175842465, + "grad_norm": 0.39361506700515747, + "learning_rate": 3.577302631578947e-05, + "loss": 0.3575, + "step": 2413 + }, + { + "epoch": 2.678130633753987, + "grad_norm": 0.30774053931236267, + "learning_rate": 3.564967105263158e-05, + "loss": 0.2718, + "step": 2414 + }, + { + "epoch": 2.6792400499237274, + "grad_norm": 0.40254542231559753, + "learning_rate": 3.552631578947368e-05, + "loss": 0.4364, + "step": 2415 + }, + { + "epoch": 2.6803494660934684, + "grad_norm": 0.39855626225471497, + "learning_rate": 3.5402960526315784e-05, + "loss": 0.2258, + "step": 2416 + }, + { + "epoch": 2.681458882263209, + "grad_norm": 0.3063789904117584, + "learning_rate": 3.527960526315789e-05, + "loss": 0.3248, + "step": 2417 + }, + { + "epoch": 2.6825682984329497, + "grad_norm": 0.48203665018081665, + "learning_rate": 3.515625e-05, + "loss": 0.5971, + "step": 2418 + }, + { + "epoch": 2.6836777146026902, + "grad_norm": 0.40560922026634216, + "learning_rate": 3.5032894736842104e-05, + "loss": 0.2832, + "step": 2419 + }, + { + "epoch": 2.684787130772431, + "grad_norm": 0.4571300745010376, + "learning_rate": 3.490953947368421e-05, + "loss": 0.414, + "step": 2420 + }, + { + "epoch": 2.6858965469421716, + "grad_norm": 0.29357972741127014, + "learning_rate": 3.478618421052632e-05, + "loss": 0.4382, + "step": 2421 + }, + { + "epoch": 2.687005963111912, + "grad_norm": 0.4692830741405487, + "learning_rate": 3.466282894736842e-05, + "loss": 0.4557, + "step": 2422 + }, + { + "epoch": 2.688115379281653, + "grad_norm": 0.5453107953071594, + "learning_rate": 3.4539473684210524e-05, + "loss": 0.456, + "step": 2423 + }, + { + "epoch": 2.689224795451394, + "grad_norm": 0.8587030172348022, + "learning_rate": 3.441611842105263e-05, + "loss": 0.5188, + "step": 2424 + }, + { + "epoch": 2.6903342116211344, + "grad_norm": 0.5260380506515503, + "learning_rate": 3.429276315789474e-05, + "loss": 0.4475, + "step": 2425 + }, + { + "epoch": 2.691443627790875, + "grad_norm": 0.2895418405532837, + "learning_rate": 3.416940789473684e-05, + "loss": 0.245, + "step": 2426 + }, + { + "epoch": 2.692553043960616, + "grad_norm": 0.44756054878234863, + "learning_rate": 3.4046052631578944e-05, + "loss": 0.4375, + "step": 2427 + }, + { + "epoch": 2.6936624601303563, + "grad_norm": 0.39095133543014526, + "learning_rate": 3.392269736842105e-05, + "loss": 0.405, + "step": 2428 + }, + { + "epoch": 2.694771876300097, + "grad_norm": 0.6433843374252319, + "learning_rate": 3.379934210526315e-05, + "loss": 0.2905, + "step": 2429 + }, + { + "epoch": 2.6958812924698377, + "grad_norm": 0.5514675974845886, + "learning_rate": 3.367598684210526e-05, + "loss": 0.3651, + "step": 2430 + }, + { + "epoch": 2.6969907086395786, + "grad_norm": 0.4037233293056488, + "learning_rate": 3.3552631578947364e-05, + "loss": 0.5534, + "step": 2431 + }, + { + "epoch": 2.698100124809319, + "grad_norm": 0.3961438238620758, + "learning_rate": 3.342927631578947e-05, + "loss": 0.4399, + "step": 2432 + }, + { + "epoch": 2.6992095409790595, + "grad_norm": 0.38111430406570435, + "learning_rate": 3.330592105263158e-05, + "loss": 0.3532, + "step": 2433 + }, + { + "epoch": 2.7003189571488004, + "grad_norm": 0.35400980710983276, + "learning_rate": 3.318256578947368e-05, + "loss": 0.5905, + "step": 2434 + }, + { + "epoch": 2.7014283733185414, + "grad_norm": 0.469614714384079, + "learning_rate": 3.3059210526315785e-05, + "loss": 0.4461, + "step": 2435 + }, + { + "epoch": 2.702537789488282, + "grad_norm": 0.2964523732662201, + "learning_rate": 3.293585526315789e-05, + "loss": 0.3752, + "step": 2436 + }, + { + "epoch": 2.7036472056580223, + "grad_norm": 0.4390257000923157, + "learning_rate": 3.28125e-05, + "loss": 0.384, + "step": 2437 + }, + { + "epoch": 2.704756621827763, + "grad_norm": 0.6685402393341064, + "learning_rate": 3.2689144736842105e-05, + "loss": 0.3686, + "step": 2438 + }, + { + "epoch": 2.7058660379975037, + "grad_norm": 0.42853912711143494, + "learning_rate": 3.256578947368421e-05, + "loss": 0.3165, + "step": 2439 + }, + { + "epoch": 2.7069754541672446, + "grad_norm": 0.382541686296463, + "learning_rate": 3.244243421052632e-05, + "loss": 0.4957, + "step": 2440 + }, + { + "epoch": 2.708084870336985, + "grad_norm": 0.5440720319747925, + "learning_rate": 3.231907894736842e-05, + "loss": 0.4417, + "step": 2441 + }, + { + "epoch": 2.709194286506726, + "grad_norm": 0.4604506492614746, + "learning_rate": 3.2195723684210525e-05, + "loss": 0.4257, + "step": 2442 + }, + { + "epoch": 2.7103037026764665, + "grad_norm": 0.45395079255104065, + "learning_rate": 3.207236842105263e-05, + "loss": 0.2593, + "step": 2443 + }, + { + "epoch": 2.711413118846207, + "grad_norm": 0.4349961578845978, + "learning_rate": 3.194901315789473e-05, + "loss": 0.3596, + "step": 2444 + }, + { + "epoch": 2.712522535015948, + "grad_norm": 0.46334758400917053, + "learning_rate": 3.182565789473684e-05, + "loss": 0.3347, + "step": 2445 + }, + { + "epoch": 2.713631951185689, + "grad_norm": 0.4385397136211395, + "learning_rate": 3.1702302631578945e-05, + "loss": 0.1988, + "step": 2446 + }, + { + "epoch": 2.7147413673554293, + "grad_norm": 0.6983950734138489, + "learning_rate": 3.1578947368421045e-05, + "loss": 0.3902, + "step": 2447 + }, + { + "epoch": 2.7158507835251697, + "grad_norm": 0.46519941091537476, + "learning_rate": 3.145559210526315e-05, + "loss": 0.9286, + "step": 2448 + }, + { + "epoch": 2.7169601996949106, + "grad_norm": 0.4739225506782532, + "learning_rate": 3.133223684210526e-05, + "loss": 0.4028, + "step": 2449 + }, + { + "epoch": 2.718069615864651, + "grad_norm": 0.45045381784439087, + "learning_rate": 3.1208881578947365e-05, + "loss": 0.2989, + "step": 2450 + }, + { + "epoch": 2.719179032034392, + "grad_norm": 0.5387830138206482, + "learning_rate": 3.108552631578947e-05, + "loss": 0.3876, + "step": 2451 + }, + { + "epoch": 2.7202884482041325, + "grad_norm": 0.4015057682991028, + "learning_rate": 3.096217105263158e-05, + "loss": 0.3363, + "step": 2452 + }, + { + "epoch": 2.7213978643738734, + "grad_norm": 0.9120666980743408, + "learning_rate": 3.083881578947368e-05, + "loss": 0.3437, + "step": 2453 + }, + { + "epoch": 2.722507280543614, + "grad_norm": 0.3784855902194977, + "learning_rate": 3.0715460526315785e-05, + "loss": 0.3015, + "step": 2454 + }, + { + "epoch": 2.7236166967133544, + "grad_norm": 0.3855441212654114, + "learning_rate": 3.059210526315789e-05, + "loss": 0.3813, + "step": 2455 + }, + { + "epoch": 2.7247261128830953, + "grad_norm": 0.8900882601737976, + "learning_rate": 3.046875e-05, + "loss": 0.6206, + "step": 2456 + }, + { + "epoch": 2.725835529052836, + "grad_norm": 0.43435898423194885, + "learning_rate": 3.0345394736842102e-05, + "loss": 0.2936, + "step": 2457 + }, + { + "epoch": 2.7269449452225767, + "grad_norm": 0.4391900300979614, + "learning_rate": 3.022203947368421e-05, + "loss": 0.6765, + "step": 2458 + }, + { + "epoch": 2.728054361392317, + "grad_norm": 0.566494882106781, + "learning_rate": 3.0098684210526315e-05, + "loss": 0.3304, + "step": 2459 + }, + { + "epoch": 2.729163777562058, + "grad_norm": 0.35425856709480286, + "learning_rate": 2.9975328947368415e-05, + "loss": 0.3772, + "step": 2460 + }, + { + "epoch": 2.7302731937317986, + "grad_norm": 0.3330824673175812, + "learning_rate": 2.9851973684210522e-05, + "loss": 0.3451, + "step": 2461 + }, + { + "epoch": 2.7313826099015395, + "grad_norm": 0.5032577514648438, + "learning_rate": 2.972861842105263e-05, + "loss": 0.4958, + "step": 2462 + }, + { + "epoch": 2.73249202607128, + "grad_norm": 0.46107593178749084, + "learning_rate": 2.9605263157894735e-05, + "loss": 0.5935, + "step": 2463 + }, + { + "epoch": 2.733601442241021, + "grad_norm": 0.4279358983039856, + "learning_rate": 2.9481907894736842e-05, + "loss": 0.352, + "step": 2464 + }, + { + "epoch": 2.7347108584107613, + "grad_norm": 0.35816699266433716, + "learning_rate": 2.9358552631578946e-05, + "loss": 0.4058, + "step": 2465 + }, + { + "epoch": 2.735820274580502, + "grad_norm": 0.3129696249961853, + "learning_rate": 2.923519736842105e-05, + "loss": 0.2704, + "step": 2466 + }, + { + "epoch": 2.7369296907502427, + "grad_norm": 0.5344390869140625, + "learning_rate": 2.9111842105263156e-05, + "loss": 0.3926, + "step": 2467 + }, + { + "epoch": 2.738039106919983, + "grad_norm": 0.3743567168712616, + "learning_rate": 2.898848684210526e-05, + "loss": 0.3455, + "step": 2468 + }, + { + "epoch": 2.739148523089724, + "grad_norm": 0.4825611412525177, + "learning_rate": 2.8865131578947366e-05, + "loss": 0.4537, + "step": 2469 + }, + { + "epoch": 2.7402579392594646, + "grad_norm": 0.47796013951301575, + "learning_rate": 2.8741776315789472e-05, + "loss": 0.4268, + "step": 2470 + }, + { + "epoch": 2.7413673554292055, + "grad_norm": 0.38644805550575256, + "learning_rate": 2.861842105263158e-05, + "loss": 0.3989, + "step": 2471 + }, + { + "epoch": 2.742476771598946, + "grad_norm": 0.41950738430023193, + "learning_rate": 2.849506578947368e-05, + "loss": 0.4983, + "step": 2472 + }, + { + "epoch": 2.7435861877686865, + "grad_norm": 0.34149548411369324, + "learning_rate": 2.8371710526315786e-05, + "loss": 0.4519, + "step": 2473 + }, + { + "epoch": 2.7446956039384274, + "grad_norm": 0.4213709831237793, + "learning_rate": 2.8248355263157893e-05, + "loss": 0.4212, + "step": 2474 + }, + { + "epoch": 2.7458050201081683, + "grad_norm": 0.3141988217830658, + "learning_rate": 2.8125e-05, + "loss": 0.3161, + "step": 2475 + }, + { + "epoch": 2.7469144362779088, + "grad_norm": 0.3500676155090332, + "learning_rate": 2.8001644736842103e-05, + "loss": 0.3603, + "step": 2476 + }, + { + "epoch": 2.7480238524476492, + "grad_norm": 0.4086303114891052, + "learning_rate": 2.787828947368421e-05, + "loss": 0.5926, + "step": 2477 + }, + { + "epoch": 2.74913326861739, + "grad_norm": 0.5967698097229004, + "learning_rate": 2.7754934210526316e-05, + "loss": 0.237, + "step": 2478 + }, + { + "epoch": 2.7502426847871306, + "grad_norm": 0.4627840220928192, + "learning_rate": 2.7631578947368416e-05, + "loss": 0.3304, + "step": 2479 + }, + { + "epoch": 2.7513521009568715, + "grad_norm": 0.37266799807548523, + "learning_rate": 2.7508223684210523e-05, + "loss": 0.5739, + "step": 2480 + }, + { + "epoch": 2.752461517126612, + "grad_norm": 0.42507204413414, + "learning_rate": 2.738486842105263e-05, + "loss": 0.4707, + "step": 2481 + }, + { + "epoch": 2.753570933296353, + "grad_norm": 0.4577075242996216, + "learning_rate": 2.7261513157894736e-05, + "loss": 0.5197, + "step": 2482 + }, + { + "epoch": 2.7546803494660934, + "grad_norm": 0.6312010884284973, + "learning_rate": 2.713815789473684e-05, + "loss": 0.5129, + "step": 2483 + }, + { + "epoch": 2.755789765635834, + "grad_norm": 0.5095051527023315, + "learning_rate": 2.7014802631578946e-05, + "loss": 0.329, + "step": 2484 + }, + { + "epoch": 2.756899181805575, + "grad_norm": 0.5552304983139038, + "learning_rate": 2.689144736842105e-05, + "loss": 0.516, + "step": 2485 + }, + { + "epoch": 2.7580085979753157, + "grad_norm": 0.4446256458759308, + "learning_rate": 2.6768092105263153e-05, + "loss": 0.4809, + "step": 2486 + }, + { + "epoch": 2.759118014145056, + "grad_norm": 0.45802775025367737, + "learning_rate": 2.664473684210526e-05, + "loss": 0.2507, + "step": 2487 + }, + { + "epoch": 2.7602274303147967, + "grad_norm": 1.002074956893921, + "learning_rate": 2.6521381578947366e-05, + "loss": 0.3177, + "step": 2488 + }, + { + "epoch": 2.7613368464845376, + "grad_norm": 0.37677767872810364, + "learning_rate": 2.6398026315789473e-05, + "loss": 0.3994, + "step": 2489 + }, + { + "epoch": 2.762446262654278, + "grad_norm": 0.5203359723091125, + "learning_rate": 2.627467105263158e-05, + "loss": 0.3421, + "step": 2490 + }, + { + "epoch": 2.763555678824019, + "grad_norm": 0.41637536883354187, + "learning_rate": 2.615131578947368e-05, + "loss": 0.4927, + "step": 2491 + }, + { + "epoch": 2.7646650949937595, + "grad_norm": 0.44756412506103516, + "learning_rate": 2.6027960526315786e-05, + "loss": 0.3734, + "step": 2492 + }, + { + "epoch": 2.7657745111635004, + "grad_norm": 0.3564557731151581, + "learning_rate": 2.5904605263157893e-05, + "loss": 0.3779, + "step": 2493 + }, + { + "epoch": 2.766883927333241, + "grad_norm": 0.3852544128894806, + "learning_rate": 2.5781249999999996e-05, + "loss": 0.6018, + "step": 2494 + }, + { + "epoch": 2.7679933435029813, + "grad_norm": 0.3930635452270508, + "learning_rate": 2.5657894736842103e-05, + "loss": 0.5023, + "step": 2495 + }, + { + "epoch": 2.7691027596727222, + "grad_norm": 0.34579432010650635, + "learning_rate": 2.553453947368421e-05, + "loss": 0.5322, + "step": 2496 + }, + { + "epoch": 2.770212175842463, + "grad_norm": 0.38263997435569763, + "learning_rate": 2.5411184210526317e-05, + "loss": 0.3458, + "step": 2497 + }, + { + "epoch": 2.7713215920122036, + "grad_norm": 0.34970882534980774, + "learning_rate": 2.5287828947368417e-05, + "loss": 0.4036, + "step": 2498 + }, + { + "epoch": 2.772431008181944, + "grad_norm": 0.36101511120796204, + "learning_rate": 2.5164473684210523e-05, + "loss": 0.2912, + "step": 2499 + }, + { + "epoch": 2.773540424351685, + "grad_norm": 0.39122408628463745, + "learning_rate": 2.504111842105263e-05, + "loss": 0.3174, + "step": 2500 + }, + { + "epoch": 2.7746498405214255, + "grad_norm": 0.3180815875530243, + "learning_rate": 2.4917763157894737e-05, + "loss": 0.4215, + "step": 2501 + }, + { + "epoch": 2.7757592566911664, + "grad_norm": 0.4079410433769226, + "learning_rate": 2.479440789473684e-05, + "loss": 0.5662, + "step": 2502 + }, + { + "epoch": 2.776868672860907, + "grad_norm": 0.35038089752197266, + "learning_rate": 2.4671052631578947e-05, + "loss": 0.4216, + "step": 2503 + }, + { + "epoch": 2.777978089030648, + "grad_norm": 0.34112969040870667, + "learning_rate": 2.454769736842105e-05, + "loss": 0.3627, + "step": 2504 + }, + { + "epoch": 2.7790875052003883, + "grad_norm": 0.38411781191825867, + "learning_rate": 2.4424342105263153e-05, + "loss": 0.3333, + "step": 2505 + }, + { + "epoch": 2.7801969213701287, + "grad_norm": 0.4743475615978241, + "learning_rate": 2.430098684210526e-05, + "loss": 0.4561, + "step": 2506 + }, + { + "epoch": 2.7813063375398697, + "grad_norm": 0.3875446915626526, + "learning_rate": 2.4177631578947367e-05, + "loss": 0.5246, + "step": 2507 + }, + { + "epoch": 2.7824157537096106, + "grad_norm": 0.3834385573863983, + "learning_rate": 2.4054276315789474e-05, + "loss": 0.4338, + "step": 2508 + }, + { + "epoch": 2.783525169879351, + "grad_norm": 0.4199206233024597, + "learning_rate": 2.3930921052631577e-05, + "loss": 0.3568, + "step": 2509 + }, + { + "epoch": 2.7846345860490915, + "grad_norm": 0.3700524568557739, + "learning_rate": 2.380756578947368e-05, + "loss": 0.4707, + "step": 2510 + }, + { + "epoch": 2.7857440022188324, + "grad_norm": 0.3416059911251068, + "learning_rate": 2.3684210526315787e-05, + "loss": 0.3664, + "step": 2511 + }, + { + "epoch": 2.786853418388573, + "grad_norm": 0.39632654190063477, + "learning_rate": 2.356085526315789e-05, + "loss": 0.5128, + "step": 2512 + }, + { + "epoch": 2.787962834558314, + "grad_norm": 0.5743038654327393, + "learning_rate": 2.3437499999999997e-05, + "loss": 0.4739, + "step": 2513 + }, + { + "epoch": 2.7890722507280543, + "grad_norm": 0.3902910649776459, + "learning_rate": 2.3314144736842104e-05, + "loss": 0.4639, + "step": 2514 + }, + { + "epoch": 2.7901816668977952, + "grad_norm": 0.3890102803707123, + "learning_rate": 2.319078947368421e-05, + "loss": 0.4134, + "step": 2515 + }, + { + "epoch": 2.7912910830675357, + "grad_norm": 0.5704060196876526, + "learning_rate": 2.3067434210526317e-05, + "loss": 0.4693, + "step": 2516 + }, + { + "epoch": 2.792400499237276, + "grad_norm": 0.37423625588417053, + "learning_rate": 2.2944078947368417e-05, + "loss": 0.3357, + "step": 2517 + }, + { + "epoch": 2.793509915407017, + "grad_norm": 0.33532601594924927, + "learning_rate": 2.2820723684210524e-05, + "loss": 0.4696, + "step": 2518 + }, + { + "epoch": 2.7946193315767576, + "grad_norm": 0.3831970989704132, + "learning_rate": 2.269736842105263e-05, + "loss": 0.3145, + "step": 2519 + }, + { + "epoch": 2.7957287477464985, + "grad_norm": 0.312404602766037, + "learning_rate": 2.2574013157894734e-05, + "loss": 0.408, + "step": 2520 + }, + { + "epoch": 2.796838163916239, + "grad_norm": 0.396106094121933, + "learning_rate": 2.245065789473684e-05, + "loss": 0.33, + "step": 2521 + }, + { + "epoch": 2.79794758008598, + "grad_norm": 0.3440202474594116, + "learning_rate": 2.2327302631578947e-05, + "loss": 0.3087, + "step": 2522 + }, + { + "epoch": 2.7990569962557204, + "grad_norm": 0.3287065625190735, + "learning_rate": 2.2203947368421047e-05, + "loss": 0.2821, + "step": 2523 + }, + { + "epoch": 2.800166412425461, + "grad_norm": 0.5490508675575256, + "learning_rate": 2.2080592105263154e-05, + "loss": 0.4684, + "step": 2524 + }, + { + "epoch": 2.8012758285952017, + "grad_norm": 0.560986340045929, + "learning_rate": 2.195723684210526e-05, + "loss": 0.2953, + "step": 2525 + }, + { + "epoch": 2.8023852447649427, + "grad_norm": 0.4068681001663208, + "learning_rate": 2.1833881578947368e-05, + "loss": 0.4226, + "step": 2526 + }, + { + "epoch": 2.803494660934683, + "grad_norm": 0.3921424448490143, + "learning_rate": 2.1710526315789474e-05, + "loss": 0.3098, + "step": 2527 + }, + { + "epoch": 2.8046040771044236, + "grad_norm": 0.40971678495407104, + "learning_rate": 2.1587171052631578e-05, + "loss": 0.3498, + "step": 2528 + }, + { + "epoch": 2.8057134932741645, + "grad_norm": 0.2933211326599121, + "learning_rate": 2.146381578947368e-05, + "loss": 0.4814, + "step": 2529 + }, + { + "epoch": 2.806822909443905, + "grad_norm": 0.5020395517349243, + "learning_rate": 2.1340460526315788e-05, + "loss": 0.4159, + "step": 2530 + }, + { + "epoch": 2.807932325613646, + "grad_norm": 0.3937400281429291, + "learning_rate": 2.121710526315789e-05, + "loss": 0.3239, + "step": 2531 + }, + { + "epoch": 2.8090417417833864, + "grad_norm": 0.47247281670570374, + "learning_rate": 2.1093749999999998e-05, + "loss": 0.4367, + "step": 2532 + }, + { + "epoch": 2.8101511579531273, + "grad_norm": 0.44965869188308716, + "learning_rate": 2.0970394736842104e-05, + "loss": 0.3051, + "step": 2533 + }, + { + "epoch": 2.811260574122868, + "grad_norm": 0.5918648838996887, + "learning_rate": 2.084703947368421e-05, + "loss": 0.3252, + "step": 2534 + }, + { + "epoch": 2.8123699902926083, + "grad_norm": 0.4035295844078064, + "learning_rate": 2.0723684210526315e-05, + "loss": 0.3419, + "step": 2535 + }, + { + "epoch": 2.813479406462349, + "grad_norm": 0.43882495164871216, + "learning_rate": 2.0600328947368418e-05, + "loss": 0.3963, + "step": 2536 + }, + { + "epoch": 2.81458882263209, + "grad_norm": 0.3037029206752777, + "learning_rate": 2.0476973684210525e-05, + "loss": 0.2939, + "step": 2537 + }, + { + "epoch": 2.8156982388018306, + "grad_norm": 0.42646703124046326, + "learning_rate": 2.0353618421052628e-05, + "loss": 0.376, + "step": 2538 + }, + { + "epoch": 2.816807654971571, + "grad_norm": 0.3901432156562805, + "learning_rate": 2.0230263157894735e-05, + "loss": 0.3588, + "step": 2539 + }, + { + "epoch": 2.817917071141312, + "grad_norm": 0.4185813367366791, + "learning_rate": 2.010690789473684e-05, + "loss": 0.4512, + "step": 2540 + }, + { + "epoch": 2.8190264873110524, + "grad_norm": 0.46253278851509094, + "learning_rate": 1.9983552631578948e-05, + "loss": 0.3559, + "step": 2541 + }, + { + "epoch": 2.8201359034807933, + "grad_norm": 0.49084243178367615, + "learning_rate": 1.9860197368421048e-05, + "loss": 0.4087, + "step": 2542 + }, + { + "epoch": 2.821245319650534, + "grad_norm": 0.4082900285720825, + "learning_rate": 1.9736842105263155e-05, + "loss": 0.3204, + "step": 2543 + }, + { + "epoch": 2.8223547358202747, + "grad_norm": 0.4727761149406433, + "learning_rate": 1.961348684210526e-05, + "loss": 0.5809, + "step": 2544 + }, + { + "epoch": 2.823464151990015, + "grad_norm": 0.37872323393821716, + "learning_rate": 1.9490131578947368e-05, + "loss": 0.2695, + "step": 2545 + }, + { + "epoch": 2.8245735681597557, + "grad_norm": 0.4297656714916229, + "learning_rate": 1.936677631578947e-05, + "loss": 0.2949, + "step": 2546 + }, + { + "epoch": 2.8256829843294966, + "grad_norm": 0.5141124725341797, + "learning_rate": 1.9243421052631578e-05, + "loss": 0.475, + "step": 2547 + }, + { + "epoch": 2.8267924004992375, + "grad_norm": 0.4390595853328705, + "learning_rate": 1.912006578947368e-05, + "loss": 0.4907, + "step": 2548 + }, + { + "epoch": 2.827901816668978, + "grad_norm": 0.5395392179489136, + "learning_rate": 1.8996710526315785e-05, + "loss": 0.3074, + "step": 2549 + }, + { + "epoch": 2.8290112328387185, + "grad_norm": 0.5195388197898865, + "learning_rate": 1.887335526315789e-05, + "loss": 0.3901, + "step": 2550 + }, + { + "epoch": 2.8301206490084594, + "grad_norm": 0.5643588304519653, + "learning_rate": 1.875e-05, + "loss": 0.528, + "step": 2551 + }, + { + "epoch": 2.8312300651782, + "grad_norm": 0.6140674352645874, + "learning_rate": 1.8626644736842105e-05, + "loss": 0.404, + "step": 2552 + }, + { + "epoch": 2.832339481347941, + "grad_norm": 1.3323380947113037, + "learning_rate": 1.850328947368421e-05, + "loss": 0.1839, + "step": 2553 + }, + { + "epoch": 2.8334488975176813, + "grad_norm": 0.3624992072582245, + "learning_rate": 1.8379934210526315e-05, + "loss": 0.5518, + "step": 2554 + }, + { + "epoch": 2.834558313687422, + "grad_norm": 0.493804395198822, + "learning_rate": 1.825657894736842e-05, + "loss": 0.2108, + "step": 2555 + }, + { + "epoch": 2.8356677298571626, + "grad_norm": 0.41342535614967346, + "learning_rate": 1.8133223684210525e-05, + "loss": 0.5485, + "step": 2556 + }, + { + "epoch": 2.836777146026903, + "grad_norm": 0.37890028953552246, + "learning_rate": 1.800986842105263e-05, + "loss": 0.2336, + "step": 2557 + }, + { + "epoch": 2.837886562196644, + "grad_norm": 0.5276500582695007, + "learning_rate": 1.7886513157894735e-05, + "loss": 0.3295, + "step": 2558 + }, + { + "epoch": 2.838995978366385, + "grad_norm": 0.8836992979049683, + "learning_rate": 1.776315789473684e-05, + "loss": 0.4195, + "step": 2559 + }, + { + "epoch": 2.8401053945361254, + "grad_norm": 0.36474689841270447, + "learning_rate": 1.7639802631578945e-05, + "loss": 0.5335, + "step": 2560 + }, + { + "epoch": 2.841214810705866, + "grad_norm": 0.3125799298286438, + "learning_rate": 1.7516447368421052e-05, + "loss": 0.3293, + "step": 2561 + }, + { + "epoch": 2.842324226875607, + "grad_norm": 0.4086284041404724, + "learning_rate": 1.739309210526316e-05, + "loss": 0.8532, + "step": 2562 + }, + { + "epoch": 2.8434336430453473, + "grad_norm": 0.5705865025520325, + "learning_rate": 1.7269736842105262e-05, + "loss": 0.7003, + "step": 2563 + }, + { + "epoch": 2.844543059215088, + "grad_norm": 0.3946170508861542, + "learning_rate": 1.714638157894737e-05, + "loss": 0.3741, + "step": 2564 + }, + { + "epoch": 2.8456524753848287, + "grad_norm": 0.5471513271331787, + "learning_rate": 1.7023026315789472e-05, + "loss": 0.3288, + "step": 2565 + }, + { + "epoch": 2.8467618915545696, + "grad_norm": 0.43899303674697876, + "learning_rate": 1.6899671052631575e-05, + "loss": 0.5189, + "step": 2566 + }, + { + "epoch": 2.84787130772431, + "grad_norm": 0.6397573351860046, + "learning_rate": 1.6776315789473682e-05, + "loss": 0.3055, + "step": 2567 + }, + { + "epoch": 2.8489807238940505, + "grad_norm": 0.3993845582008362, + "learning_rate": 1.665296052631579e-05, + "loss": 0.3922, + "step": 2568 + }, + { + "epoch": 2.8500901400637915, + "grad_norm": 0.27621200680732727, + "learning_rate": 1.6529605263157892e-05, + "loss": 0.3626, + "step": 2569 + }, + { + "epoch": 2.851199556233532, + "grad_norm": 0.721236526966095, + "learning_rate": 1.640625e-05, + "loss": 0.5003, + "step": 2570 + }, + { + "epoch": 2.852308972403273, + "grad_norm": 0.46138882637023926, + "learning_rate": 1.6282894736842106e-05, + "loss": 0.3559, + "step": 2571 + }, + { + "epoch": 2.8534183885730133, + "grad_norm": 0.3132235109806061, + "learning_rate": 1.615953947368421e-05, + "loss": 0.3119, + "step": 2572 + }, + { + "epoch": 2.8545278047427542, + "grad_norm": 0.4404788911342621, + "learning_rate": 1.6036184210526316e-05, + "loss": 0.3208, + "step": 2573 + }, + { + "epoch": 2.8556372209124947, + "grad_norm": 0.5489991307258606, + "learning_rate": 1.591282894736842e-05, + "loss": 0.3501, + "step": 2574 + }, + { + "epoch": 2.856746637082235, + "grad_norm": 0.5425270199775696, + "learning_rate": 1.5789473684210522e-05, + "loss": 0.3652, + "step": 2575 + }, + { + "epoch": 2.857856053251976, + "grad_norm": 0.3924858570098877, + "learning_rate": 1.566611842105263e-05, + "loss": 0.3799, + "step": 2576 + }, + { + "epoch": 2.858965469421717, + "grad_norm": 0.381849080324173, + "learning_rate": 1.5542763157894736e-05, + "loss": 0.4581, + "step": 2577 + }, + { + "epoch": 2.8600748855914575, + "grad_norm": 0.38210979104042053, + "learning_rate": 1.541940789473684e-05, + "loss": 0.4339, + "step": 2578 + }, + { + "epoch": 2.861184301761198, + "grad_norm": 0.49273112416267395, + "learning_rate": 1.5296052631578946e-05, + "loss": 0.3847, + "step": 2579 + }, + { + "epoch": 2.862293717930939, + "grad_norm": 0.45563986897468567, + "learning_rate": 1.5172697368421051e-05, + "loss": 0.3684, + "step": 2580 + }, + { + "epoch": 2.8634031341006794, + "grad_norm": 0.5015951991081238, + "learning_rate": 1.5049342105263158e-05, + "loss": 0.5179, + "step": 2581 + }, + { + "epoch": 2.8645125502704203, + "grad_norm": 0.6307615041732788, + "learning_rate": 1.4925986842105261e-05, + "loss": 0.486, + "step": 2582 + }, + { + "epoch": 2.8656219664401608, + "grad_norm": 0.36506178975105286, + "learning_rate": 1.4802631578947368e-05, + "loss": 0.5584, + "step": 2583 + }, + { + "epoch": 2.8667313826099017, + "grad_norm": 0.39168620109558105, + "learning_rate": 1.4679276315789473e-05, + "loss": 0.3033, + "step": 2584 + }, + { + "epoch": 2.867840798779642, + "grad_norm": 0.5694892406463623, + "learning_rate": 1.4555921052631578e-05, + "loss": 0.4762, + "step": 2585 + }, + { + "epoch": 2.8689502149493826, + "grad_norm": 0.3743266463279724, + "learning_rate": 1.4432565789473683e-05, + "loss": 0.2169, + "step": 2586 + }, + { + "epoch": 2.8700596311191235, + "grad_norm": 0.32552504539489746, + "learning_rate": 1.430921052631579e-05, + "loss": 0.5623, + "step": 2587 + }, + { + "epoch": 2.8711690472888645, + "grad_norm": 0.4949817359447479, + "learning_rate": 1.4185855263157893e-05, + "loss": 0.5808, + "step": 2588 + }, + { + "epoch": 2.872278463458605, + "grad_norm": 0.5175449848175049, + "learning_rate": 1.40625e-05, + "loss": 0.4294, + "step": 2589 + }, + { + "epoch": 2.8733878796283454, + "grad_norm": 0.4763440191745758, + "learning_rate": 1.3939144736842105e-05, + "loss": 0.3579, + "step": 2590 + }, + { + "epoch": 2.8744972957980863, + "grad_norm": 0.3480813205242157, + "learning_rate": 1.3815789473684208e-05, + "loss": 0.3258, + "step": 2591 + }, + { + "epoch": 2.875606711967827, + "grad_norm": 0.33044132590293884, + "learning_rate": 1.3692434210526315e-05, + "loss": 0.4225, + "step": 2592 + }, + { + "epoch": 2.8767161281375677, + "grad_norm": 0.4161834418773651, + "learning_rate": 1.356907894736842e-05, + "loss": 0.3972, + "step": 2593 + }, + { + "epoch": 2.877825544307308, + "grad_norm": 0.42129024863243103, + "learning_rate": 1.3445723684210525e-05, + "loss": 0.4956, + "step": 2594 + }, + { + "epoch": 2.878934960477049, + "grad_norm": 0.3572520613670349, + "learning_rate": 1.332236842105263e-05, + "loss": 0.7549, + "step": 2595 + }, + { + "epoch": 2.8800443766467896, + "grad_norm": 0.3236415684223175, + "learning_rate": 1.3199013157894737e-05, + "loss": 0.7084, + "step": 2596 + }, + { + "epoch": 2.88115379281653, + "grad_norm": 0.42282864451408386, + "learning_rate": 1.307565789473684e-05, + "loss": 0.439, + "step": 2597 + }, + { + "epoch": 2.882263208986271, + "grad_norm": 0.3626263737678528, + "learning_rate": 1.2952302631578947e-05, + "loss": 0.36, + "step": 2598 + }, + { + "epoch": 2.883372625156012, + "grad_norm": 0.4625903367996216, + "learning_rate": 1.2828947368421052e-05, + "loss": 0.3951, + "step": 2599 + }, + { + "epoch": 2.8844820413257524, + "grad_norm": 0.30453893542289734, + "learning_rate": 1.2705592105263158e-05, + "loss": 0.3333, + "step": 2600 + }, + { + "epoch": 2.885591457495493, + "grad_norm": 0.43295082449913025, + "learning_rate": 1.2582236842105262e-05, + "loss": 0.4729, + "step": 2601 + }, + { + "epoch": 2.8867008736652338, + "grad_norm": 0.3720959424972534, + "learning_rate": 1.2458881578947368e-05, + "loss": 0.3683, + "step": 2602 + }, + { + "epoch": 2.8878102898349742, + "grad_norm": 0.2515357434749603, + "learning_rate": 1.2335526315789473e-05, + "loss": 0.2588, + "step": 2603 + }, + { + "epoch": 2.888919706004715, + "grad_norm": 0.41880831122398376, + "learning_rate": 1.2212171052631577e-05, + "loss": 0.4133, + "step": 2604 + }, + { + "epoch": 2.8900291221744556, + "grad_norm": 0.4307885766029358, + "learning_rate": 1.2088815789473683e-05, + "loss": 0.3466, + "step": 2605 + }, + { + "epoch": 2.8911385383441965, + "grad_norm": 0.5042724609375, + "learning_rate": 1.1965460526315788e-05, + "loss": 0.3241, + "step": 2606 + }, + { + "epoch": 2.892247954513937, + "grad_norm": 0.4066576063632965, + "learning_rate": 1.1842105263157894e-05, + "loss": 0.4199, + "step": 2607 + }, + { + "epoch": 2.8933573706836775, + "grad_norm": 0.43350422382354736, + "learning_rate": 1.1718749999999999e-05, + "loss": 0.4903, + "step": 2608 + }, + { + "epoch": 2.8944667868534184, + "grad_norm": 0.5719215273857117, + "learning_rate": 1.1595394736842105e-05, + "loss": 0.4011, + "step": 2609 + }, + { + "epoch": 2.8955762030231593, + "grad_norm": 0.44598355889320374, + "learning_rate": 1.1472039473684209e-05, + "loss": 0.3502, + "step": 2610 + }, + { + "epoch": 2.8966856191929, + "grad_norm": 0.3669666051864624, + "learning_rate": 1.1348684210526315e-05, + "loss": 0.5565, + "step": 2611 + }, + { + "epoch": 2.8977950353626403, + "grad_norm": 0.3861677348613739, + "learning_rate": 1.122532894736842e-05, + "loss": 0.4093, + "step": 2612 + }, + { + "epoch": 2.898904451532381, + "grad_norm": 0.3836744427680969, + "learning_rate": 1.1101973684210524e-05, + "loss": 0.5552, + "step": 2613 + }, + { + "epoch": 2.9000138677021217, + "grad_norm": 0.5049236416816711, + "learning_rate": 1.097861842105263e-05, + "loss": 0.3947, + "step": 2614 + }, + { + "epoch": 2.9011232838718626, + "grad_norm": 0.3340136706829071, + "learning_rate": 1.0855263157894737e-05, + "loss": 0.3379, + "step": 2615 + }, + { + "epoch": 2.902232700041603, + "grad_norm": 0.41469404101371765, + "learning_rate": 1.073190789473684e-05, + "loss": 0.4554, + "step": 2616 + }, + { + "epoch": 2.903342116211344, + "grad_norm": 0.34765610098838806, + "learning_rate": 1.0608552631578946e-05, + "loss": 0.287, + "step": 2617 + }, + { + "epoch": 2.9044515323810844, + "grad_norm": 0.3712421655654907, + "learning_rate": 1.0485197368421052e-05, + "loss": 0.3694, + "step": 2618 + }, + { + "epoch": 2.905560948550825, + "grad_norm": 0.37911391258239746, + "learning_rate": 1.0361842105263157e-05, + "loss": 0.3963, + "step": 2619 + }, + { + "epoch": 2.906670364720566, + "grad_norm": 0.3563274145126343, + "learning_rate": 1.0238486842105262e-05, + "loss": 0.4382, + "step": 2620 + }, + { + "epoch": 2.9077797808903068, + "grad_norm": 0.535892128944397, + "learning_rate": 1.0115131578947367e-05, + "loss": 0.4363, + "step": 2621 + }, + { + "epoch": 2.9088891970600472, + "grad_norm": 0.4795096218585968, + "learning_rate": 9.991776315789474e-06, + "loss": 0.5396, + "step": 2622 + }, + { + "epoch": 2.9099986132297877, + "grad_norm": 0.5538339614868164, + "learning_rate": 9.868421052631577e-06, + "loss": 0.3635, + "step": 2623 + }, + { + "epoch": 2.9111080293995286, + "grad_norm": 0.3773338198661804, + "learning_rate": 9.745065789473684e-06, + "loss": 0.4025, + "step": 2624 + }, + { + "epoch": 2.912217445569269, + "grad_norm": 0.6125039458274841, + "learning_rate": 9.621710526315789e-06, + "loss": 0.4946, + "step": 2625 + }, + { + "epoch": 2.91332686173901, + "grad_norm": 0.3754761219024658, + "learning_rate": 9.498355263157892e-06, + "loss": 0.3312, + "step": 2626 + }, + { + "epoch": 2.9144362779087505, + "grad_norm": 0.5250300168991089, + "learning_rate": 9.375e-06, + "loss": 0.4738, + "step": 2627 + }, + { + "epoch": 2.9155456940784914, + "grad_norm": 0.42315995693206787, + "learning_rate": 9.251644736842104e-06, + "loss": 0.3803, + "step": 2628 + }, + { + "epoch": 2.916655110248232, + "grad_norm": 0.5445030927658081, + "learning_rate": 9.12828947368421e-06, + "loss": 0.5786, + "step": 2629 + }, + { + "epoch": 2.9177645264179723, + "grad_norm": 0.6609618067741394, + "learning_rate": 9.004934210526314e-06, + "loss": 0.386, + "step": 2630 + }, + { + "epoch": 2.9188739425877133, + "grad_norm": 0.36153900623321533, + "learning_rate": 8.88157894736842e-06, + "loss": 0.3705, + "step": 2631 + }, + { + "epoch": 2.9199833587574537, + "grad_norm": 0.39644429087638855, + "learning_rate": 8.758223684210526e-06, + "loss": 0.4203, + "step": 2632 + }, + { + "epoch": 2.9210927749271947, + "grad_norm": 0.44478243589401245, + "learning_rate": 8.634868421052631e-06, + "loss": 0.5676, + "step": 2633 + }, + { + "epoch": 2.922202191096935, + "grad_norm": 0.3410852551460266, + "learning_rate": 8.511513157894736e-06, + "loss": 0.29, + "step": 2634 + }, + { + "epoch": 2.923311607266676, + "grad_norm": 0.38531428575515747, + "learning_rate": 8.388157894736841e-06, + "loss": 0.4727, + "step": 2635 + }, + { + "epoch": 2.9244210234364165, + "grad_norm": 0.40856409072875977, + "learning_rate": 8.264802631578946e-06, + "loss": 0.2994, + "step": 2636 + }, + { + "epoch": 2.925530439606157, + "grad_norm": 0.3600262999534607, + "learning_rate": 8.141447368421053e-06, + "loss": 0.3595, + "step": 2637 + }, + { + "epoch": 2.926639855775898, + "grad_norm": 0.4171159863471985, + "learning_rate": 8.018092105263158e-06, + "loss": 0.2841, + "step": 2638 + }, + { + "epoch": 2.927749271945639, + "grad_norm": 0.35529500246047974, + "learning_rate": 7.894736842105261e-06, + "loss": 0.278, + "step": 2639 + }, + { + "epoch": 2.9288586881153793, + "grad_norm": 0.3548045754432678, + "learning_rate": 7.771381578947368e-06, + "loss": 0.6584, + "step": 2640 + }, + { + "epoch": 2.92996810428512, + "grad_norm": 0.8075834512710571, + "learning_rate": 7.648026315789473e-06, + "loss": 0.2551, + "step": 2641 + }, + { + "epoch": 2.9310775204548607, + "grad_norm": 0.4155935049057007, + "learning_rate": 7.524671052631579e-06, + "loss": 0.3003, + "step": 2642 + }, + { + "epoch": 2.932186936624601, + "grad_norm": 0.4321109652519226, + "learning_rate": 7.401315789473684e-06, + "loss": 0.343, + "step": 2643 + }, + { + "epoch": 2.933296352794342, + "grad_norm": 0.3877204358577728, + "learning_rate": 7.277960526315789e-06, + "loss": 0.3259, + "step": 2644 + }, + { + "epoch": 2.9344057689640826, + "grad_norm": 0.30750367045402527, + "learning_rate": 7.154605263157895e-06, + "loss": 0.4062, + "step": 2645 + }, + { + "epoch": 2.9355151851338235, + "grad_norm": 0.6012676954269409, + "learning_rate": 7.03125e-06, + "loss": 0.4992, + "step": 2646 + }, + { + "epoch": 2.936624601303564, + "grad_norm": 0.3799775242805481, + "learning_rate": 6.907894736842104e-06, + "loss": 0.4411, + "step": 2647 + }, + { + "epoch": 2.9377340174733044, + "grad_norm": 0.49381592869758606, + "learning_rate": 6.78453947368421e-06, + "loss": 0.3694, + "step": 2648 + }, + { + "epoch": 2.9388434336430453, + "grad_norm": 0.7122082114219666, + "learning_rate": 6.661184210526315e-06, + "loss": 0.4398, + "step": 2649 + }, + { + "epoch": 2.9399528498127863, + "grad_norm": 0.48016512393951416, + "learning_rate": 6.53782894736842e-06, + "loss": 0.3111, + "step": 2650 + }, + { + "epoch": 2.9410622659825267, + "grad_norm": 0.449733167886734, + "learning_rate": 6.414473684210526e-06, + "loss": 0.3113, + "step": 2651 + }, + { + "epoch": 2.942171682152267, + "grad_norm": 0.33274686336517334, + "learning_rate": 6.291118421052631e-06, + "loss": 0.4131, + "step": 2652 + }, + { + "epoch": 2.943281098322008, + "grad_norm": 0.37493860721588135, + "learning_rate": 6.167763157894737e-06, + "loss": 0.2557, + "step": 2653 + }, + { + "epoch": 2.9443905144917486, + "grad_norm": 0.37679123878479004, + "learning_rate": 6.044407894736842e-06, + "loss": 0.4252, + "step": 2654 + }, + { + "epoch": 2.9454999306614895, + "grad_norm": 0.40936678647994995, + "learning_rate": 5.921052631578947e-06, + "loss": 0.4795, + "step": 2655 + }, + { + "epoch": 2.94660934683123, + "grad_norm": 0.4506048560142517, + "learning_rate": 5.797697368421053e-06, + "loss": 0.5137, + "step": 2656 + }, + { + "epoch": 2.947718763000971, + "grad_norm": 0.39844781160354614, + "learning_rate": 5.674342105263158e-06, + "loss": 0.3271, + "step": 2657 + }, + { + "epoch": 2.9488281791707114, + "grad_norm": 0.39184948801994324, + "learning_rate": 5.550986842105262e-06, + "loss": 0.3885, + "step": 2658 + }, + { + "epoch": 2.949937595340452, + "grad_norm": 0.3266217112541199, + "learning_rate": 5.4276315789473686e-06, + "loss": 0.3644, + "step": 2659 + }, + { + "epoch": 2.9510470115101928, + "grad_norm": 0.4527483284473419, + "learning_rate": 5.304276315789473e-06, + "loss": 0.3945, + "step": 2660 + }, + { + "epoch": 2.9521564276799337, + "grad_norm": 0.5377230644226074, + "learning_rate": 5.180921052631579e-06, + "loss": 0.458, + "step": 2661 + }, + { + "epoch": 2.953265843849674, + "grad_norm": 0.39653611183166504, + "learning_rate": 5.057565789473684e-06, + "loss": 0.3541, + "step": 2662 + }, + { + "epoch": 2.9543752600194146, + "grad_norm": 0.2904629409313202, + "learning_rate": 4.934210526315789e-06, + "loss": 0.1772, + "step": 2663 + }, + { + "epoch": 2.9554846761891556, + "grad_norm": 0.3323127329349518, + "learning_rate": 4.8108552631578946e-06, + "loss": 0.5255, + "step": 2664 + }, + { + "epoch": 2.956594092358896, + "grad_norm": 0.4561123847961426, + "learning_rate": 4.6875e-06, + "loss": 0.3503, + "step": 2665 + }, + { + "epoch": 2.957703508528637, + "grad_norm": 0.37283483147621155, + "learning_rate": 4.564144736842105e-06, + "loss": 0.4867, + "step": 2666 + }, + { + "epoch": 2.9588129246983774, + "grad_norm": 0.5536327362060547, + "learning_rate": 4.44078947368421e-06, + "loss": 0.3599, + "step": 2667 + }, + { + "epoch": 2.9599223408681183, + "grad_norm": 0.42849066853523254, + "learning_rate": 4.3174342105263155e-06, + "loss": 0.4078, + "step": 2668 + }, + { + "epoch": 2.961031757037859, + "grad_norm": 0.4770295023918152, + "learning_rate": 4.1940789473684206e-06, + "loss": 0.5948, + "step": 2669 + }, + { + "epoch": 2.9621411732075993, + "grad_norm": 0.4041629433631897, + "learning_rate": 4.0707236842105264e-06, + "loss": 0.3036, + "step": 2670 + }, + { + "epoch": 2.96325058937734, + "grad_norm": 0.4390498697757721, + "learning_rate": 3.947368421052631e-06, + "loss": 0.373, + "step": 2671 + }, + { + "epoch": 2.964360005547081, + "grad_norm": 0.31613367795944214, + "learning_rate": 3.8240131578947365e-06, + "loss": 0.3119, + "step": 2672 + }, + { + "epoch": 2.9654694217168216, + "grad_norm": 0.4405744969844818, + "learning_rate": 3.700657894736842e-06, + "loss": 0.4571, + "step": 2673 + }, + { + "epoch": 2.966578837886562, + "grad_norm": 0.4392789900302887, + "learning_rate": 3.5773026315789474e-06, + "loss": 0.4928, + "step": 2674 + }, + { + "epoch": 2.967688254056303, + "grad_norm": 0.4602966606616974, + "learning_rate": 3.453947368421052e-06, + "loss": 0.345, + "step": 2675 + }, + { + "epoch": 2.9687976702260435, + "grad_norm": 0.447457492351532, + "learning_rate": 3.3305921052631574e-06, + "loss": 0.4556, + "step": 2676 + }, + { + "epoch": 2.9699070863957844, + "grad_norm": 0.38211768865585327, + "learning_rate": 3.207236842105263e-06, + "loss": 0.4022, + "step": 2677 + }, + { + "epoch": 2.971016502565525, + "grad_norm": 0.35372307896614075, + "learning_rate": 3.0838815789473684e-06, + "loss": 0.3703, + "step": 2678 + }, + { + "epoch": 2.9721259187352658, + "grad_norm": 0.44302472472190857, + "learning_rate": 2.9605263157894734e-06, + "loss": 0.4514, + "step": 2679 + }, + { + "epoch": 2.9732353349050062, + "grad_norm": 0.5012083053588867, + "learning_rate": 2.837171052631579e-06, + "loss": 0.4311, + "step": 2680 + }, + { + "epoch": 2.9743447510747467, + "grad_norm": 0.4370708763599396, + "learning_rate": 2.7138157894736843e-06, + "loss": 0.3342, + "step": 2681 + }, + { + "epoch": 2.9754541672444876, + "grad_norm": 0.37920454144477844, + "learning_rate": 2.5904605263157893e-06, + "loss": 0.5212, + "step": 2682 + }, + { + "epoch": 2.976563583414228, + "grad_norm": 0.2939812242984772, + "learning_rate": 2.4671052631578943e-06, + "loss": 0.3169, + "step": 2683 + }, + { + "epoch": 2.977672999583969, + "grad_norm": 0.354949414730072, + "learning_rate": 2.34375e-06, + "loss": 0.2588, + "step": 2684 + }, + { + "epoch": 2.9787824157537095, + "grad_norm": 0.334835022687912, + "learning_rate": 2.220394736842105e-06, + "loss": 0.4058, + "step": 2685 + }, + { + "epoch": 2.9798918319234504, + "grad_norm": 0.4206099212169647, + "learning_rate": 2.0970394736842103e-06, + "loss": 0.3528, + "step": 2686 + }, + { + "epoch": 2.981001248093191, + "grad_norm": 0.4646962583065033, + "learning_rate": 1.9736842105263153e-06, + "loss": 0.5455, + "step": 2687 + }, + { + "epoch": 2.9821106642629314, + "grad_norm": 0.4603005647659302, + "learning_rate": 1.850328947368421e-06, + "loss": 0.3451, + "step": 2688 + }, + { + "epoch": 2.9832200804326723, + "grad_norm": 0.4108537435531616, + "learning_rate": 1.726973684210526e-06, + "loss": 0.5365, + "step": 2689 + }, + { + "epoch": 2.984329496602413, + "grad_norm": 0.43144112825393677, + "learning_rate": 1.6036184210526314e-06, + "loss": 0.3908, + "step": 2690 + }, + { + "epoch": 2.9854389127721537, + "grad_norm": 0.5529698133468628, + "learning_rate": 1.4802631578947367e-06, + "loss": 0.3784, + "step": 2691 + }, + { + "epoch": 2.986548328941894, + "grad_norm": 0.37649285793304443, + "learning_rate": 1.3569078947368421e-06, + "loss": 0.4073, + "step": 2692 + }, + { + "epoch": 2.987657745111635, + "grad_norm": 0.3004691004753113, + "learning_rate": 1.2335526315789472e-06, + "loss": 0.4305, + "step": 2693 + }, + { + "epoch": 2.9887671612813755, + "grad_norm": 0.4551496207714081, + "learning_rate": 1.1101973684210524e-06, + "loss": 0.3474, + "step": 2694 + }, + { + "epoch": 2.9898765774511165, + "grad_norm": 0.3873107135295868, + "learning_rate": 9.868421052631577e-07, + "loss": 0.3239, + "step": 2695 + }, + { + "epoch": 2.990985993620857, + "grad_norm": 0.483542263507843, + "learning_rate": 8.63486842105263e-07, + "loss": 0.3812, + "step": 2696 + }, + { + "epoch": 2.992095409790598, + "grad_norm": 0.484693318605423, + "learning_rate": 7.401315789473683e-07, + "loss": 0.4768, + "step": 2697 + }, + { + "epoch": 2.9932048259603383, + "grad_norm": 0.3889079988002777, + "learning_rate": 6.167763157894736e-07, + "loss": 0.5391, + "step": 2698 + }, + { + "epoch": 2.994314242130079, + "grad_norm": 0.39534711837768555, + "learning_rate": 4.934210526315788e-07, + "loss": 0.2693, + "step": 2699 + }, + { + "epoch": 2.9954236582998197, + "grad_norm": 0.28255271911621094, + "learning_rate": 3.7006578947368417e-07, + "loss": 0.496, + "step": 2700 + }, + { + "epoch": 2.9965330744695606, + "grad_norm": 0.39558374881744385, + "learning_rate": 2.467105263157894e-07, + "loss": 0.3213, + "step": 2701 + }, + { + "epoch": 2.997642490639301, + "grad_norm": 0.39010322093963623, + "learning_rate": 1.233552631578947e-07, + "loss": 0.3245, + "step": 2702 + }, + { + "epoch": 2.9987519068090416, + "grad_norm": 0.33332499861717224, + "learning_rate": 0.0, + "loss": 0.3557, + "step": 2703 + }, + { + "epoch": 2.9987519068090416, + "step": 2703, + "total_flos": 1.203022932864172e+18, + "train_loss": 0.49336877679696933, + "train_runtime": 13859.7262, + "train_samples_per_second": 1.561, + "train_steps_per_second": 0.195 + } + ], + "logging_steps": 1.0, + "max_steps": 2703, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.203022932864172e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}