diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,50512 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.0, + "eval_steps": 500, + "global_step": 7210, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0006934812760055479, + "grad_norm": 0.7157525954676491, + "learning_rate": 1.3869625520110959e-08, + "loss": 0.9952, + "step": 1 + }, + { + "epoch": 0.0013869625520110957, + "grad_norm": 0.7365616469009997, + "learning_rate": 2.7739251040221917e-08, + "loss": 1.023, + "step": 2 + }, + { + "epoch": 0.0020804438280166435, + "grad_norm": 0.9607770173503413, + "learning_rate": 4.1608876560332874e-08, + "loss": 1.0586, + "step": 3 + }, + { + "epoch": 0.0027739251040221915, + "grad_norm": 0.693573066041827, + "learning_rate": 5.5478502080443834e-08, + "loss": 1.0157, + "step": 4 + }, + { + "epoch": 0.0034674063800277394, + "grad_norm": 0.8555541552251046, + "learning_rate": 6.93481276005548e-08, + "loss": 1.0226, + "step": 5 + }, + { + "epoch": 0.004160887656033287, + "grad_norm": 0.7857834335732481, + "learning_rate": 8.321775312066575e-08, + "loss": 1.1396, + "step": 6 + }, + { + "epoch": 0.0048543689320388345, + "grad_norm": 0.7637361805161516, + "learning_rate": 9.70873786407767e-08, + "loss": 1.0319, + "step": 7 + }, + { + "epoch": 0.005547850208044383, + "grad_norm": 0.8106483805629625, + "learning_rate": 1.1095700416088767e-07, + "loss": 1.0091, + "step": 8 + }, + { + "epoch": 0.0062413314840499305, + "grad_norm": 0.7027160641608973, + "learning_rate": 1.2482662968099862e-07, + "loss": 1.0236, + "step": 9 + }, + { + "epoch": 0.006934812760055479, + "grad_norm": 0.7601501023881959, + "learning_rate": 1.386962552011096e-07, + "loss": 1.0949, + "step": 10 + }, + { + "epoch": 0.0076282940360610264, + "grad_norm": 0.7194724240032185, + "learning_rate": 1.5256588072122053e-07, + "loss": 1.0223, + "step": 11 + }, + { + "epoch": 0.008321775312066574, + "grad_norm": 0.692926972724246, + "learning_rate": 1.664355062413315e-07, + "loss": 1.0643, + "step": 12 + }, + { + "epoch": 0.009015256588072122, + "grad_norm": 0.7059539310308902, + "learning_rate": 1.8030513176144244e-07, + "loss": 1.0264, + "step": 13 + }, + { + "epoch": 0.009708737864077669, + "grad_norm": 0.721973293622727, + "learning_rate": 1.941747572815534e-07, + "loss": 1.084, + "step": 14 + }, + { + "epoch": 0.010402219140083218, + "grad_norm": 0.7019103873678585, + "learning_rate": 2.080443828016644e-07, + "loss": 1.0181, + "step": 15 + }, + { + "epoch": 0.011095700416088766, + "grad_norm": 0.7399949683343555, + "learning_rate": 2.2191400832177534e-07, + "loss": 1.0885, + "step": 16 + }, + { + "epoch": 0.011789181692094313, + "grad_norm": 0.6470462981174262, + "learning_rate": 2.3578363384188628e-07, + "loss": 0.9954, + "step": 17 + }, + { + "epoch": 0.012482662968099861, + "grad_norm": 0.6664168439771029, + "learning_rate": 2.4965325936199724e-07, + "loss": 1.011, + "step": 18 + }, + { + "epoch": 0.013176144244105409, + "grad_norm": 0.7287961223313654, + "learning_rate": 2.635228848821082e-07, + "loss": 1.0184, + "step": 19 + }, + { + "epoch": 0.013869625520110958, + "grad_norm": 0.7288020368439174, + "learning_rate": 2.773925104022192e-07, + "loss": 1.1128, + "step": 20 + }, + { + "epoch": 0.014563106796116505, + "grad_norm": 0.72738295640996, + "learning_rate": 2.9126213592233014e-07, + "loss": 1.0416, + "step": 21 + }, + { + "epoch": 0.015256588072122053, + "grad_norm": 0.7148771515364, + "learning_rate": 3.0513176144244106e-07, + "loss": 1.0272, + "step": 22 + }, + { + "epoch": 0.0159500693481276, + "grad_norm": 0.733164316796413, + "learning_rate": 3.19001386962552e-07, + "loss": 1.0815, + "step": 23 + }, + { + "epoch": 0.016643550624133148, + "grad_norm": 0.7094878780073887, + "learning_rate": 3.32871012482663e-07, + "loss": 1.0639, + "step": 24 + }, + { + "epoch": 0.017337031900138695, + "grad_norm": 0.7432640656740158, + "learning_rate": 3.4674063800277396e-07, + "loss": 1.0609, + "step": 25 + }, + { + "epoch": 0.018030513176144243, + "grad_norm": 0.6591866553870518, + "learning_rate": 3.6061026352288487e-07, + "loss": 0.9867, + "step": 26 + }, + { + "epoch": 0.01872399445214979, + "grad_norm": 0.739928838539585, + "learning_rate": 3.7447988904299584e-07, + "loss": 1.0453, + "step": 27 + }, + { + "epoch": 0.019417475728155338, + "grad_norm": 0.6955722959021466, + "learning_rate": 3.883495145631068e-07, + "loss": 1.0749, + "step": 28 + }, + { + "epoch": 0.02011095700416089, + "grad_norm": 0.7262106028517052, + "learning_rate": 4.022191400832178e-07, + "loss": 1.0681, + "step": 29 + }, + { + "epoch": 0.020804438280166437, + "grad_norm": 0.8561322692646302, + "learning_rate": 4.160887656033288e-07, + "loss": 0.9954, + "step": 30 + }, + { + "epoch": 0.021497919556171984, + "grad_norm": 0.6664500793057903, + "learning_rate": 4.299583911234397e-07, + "loss": 0.9347, + "step": 31 + }, + { + "epoch": 0.022191400832177532, + "grad_norm": 0.6603222417367025, + "learning_rate": 4.4382801664355067e-07, + "loss": 0.9633, + "step": 32 + }, + { + "epoch": 0.02288488210818308, + "grad_norm": 0.6832516318019873, + "learning_rate": 4.5769764216366164e-07, + "loss": 1.0212, + "step": 33 + }, + { + "epoch": 0.023578363384188627, + "grad_norm": 0.6729259095802395, + "learning_rate": 4.7156726768377255e-07, + "loss": 1.0028, + "step": 34 + }, + { + "epoch": 0.024271844660194174, + "grad_norm": 0.6652382495219267, + "learning_rate": 4.854368932038835e-07, + "loss": 0.9346, + "step": 35 + }, + { + "epoch": 0.024965325936199722, + "grad_norm": 0.7912453925739037, + "learning_rate": 4.993065187239945e-07, + "loss": 1.0298, + "step": 36 + }, + { + "epoch": 0.02565880721220527, + "grad_norm": 0.6861473122536618, + "learning_rate": 5.131761442441055e-07, + "loss": 1.0371, + "step": 37 + }, + { + "epoch": 0.026352288488210817, + "grad_norm": 0.7424079199734541, + "learning_rate": 5.270457697642164e-07, + "loss": 1.0255, + "step": 38 + }, + { + "epoch": 0.027045769764216365, + "grad_norm": 0.6702315409020495, + "learning_rate": 5.409153952843274e-07, + "loss": 0.9796, + "step": 39 + }, + { + "epoch": 0.027739251040221916, + "grad_norm": 0.6783824210210769, + "learning_rate": 5.547850208044384e-07, + "loss": 0.999, + "step": 40 + }, + { + "epoch": 0.028432732316227463, + "grad_norm": 0.7489503428254928, + "learning_rate": 5.686546463245493e-07, + "loss": 1.0615, + "step": 41 + }, + { + "epoch": 0.02912621359223301, + "grad_norm": 0.6067754960950416, + "learning_rate": 5.825242718446603e-07, + "loss": 0.937, + "step": 42 + }, + { + "epoch": 0.029819694868238558, + "grad_norm": 0.6626081964303163, + "learning_rate": 5.963938973647713e-07, + "loss": 0.9696, + "step": 43 + }, + { + "epoch": 0.030513176144244106, + "grad_norm": 0.7780657091944089, + "learning_rate": 6.102635228848821e-07, + "loss": 1.0647, + "step": 44 + }, + { + "epoch": 0.031206657420249653, + "grad_norm": 0.7294569731371983, + "learning_rate": 6.241331484049931e-07, + "loss": 1.0183, + "step": 45 + }, + { + "epoch": 0.0319001386962552, + "grad_norm": 0.6815932670137421, + "learning_rate": 6.38002773925104e-07, + "loss": 1.0248, + "step": 46 + }, + { + "epoch": 0.03259361997226075, + "grad_norm": 0.6449349915814664, + "learning_rate": 6.51872399445215e-07, + "loss": 0.9586, + "step": 47 + }, + { + "epoch": 0.033287101248266296, + "grad_norm": 0.6652671386978704, + "learning_rate": 6.65742024965326e-07, + "loss": 1.0394, + "step": 48 + }, + { + "epoch": 0.03398058252427184, + "grad_norm": 0.6877408009142694, + "learning_rate": 6.79611650485437e-07, + "loss": 1.0005, + "step": 49 + }, + { + "epoch": 0.03467406380027739, + "grad_norm": 0.6882709742577532, + "learning_rate": 6.934812760055479e-07, + "loss": 1.0294, + "step": 50 + }, + { + "epoch": 0.03536754507628294, + "grad_norm": 0.6902783516628763, + "learning_rate": 7.073509015256588e-07, + "loss": 0.9628, + "step": 51 + }, + { + "epoch": 0.036061026352288486, + "grad_norm": 0.6488863656818254, + "learning_rate": 7.212205270457697e-07, + "loss": 1.046, + "step": 52 + }, + { + "epoch": 0.036754507628294034, + "grad_norm": 0.7047050783633263, + "learning_rate": 7.350901525658807e-07, + "loss": 1.0596, + "step": 53 + }, + { + "epoch": 0.03744798890429958, + "grad_norm": 0.6241294357822871, + "learning_rate": 7.489597780859917e-07, + "loss": 0.9602, + "step": 54 + }, + { + "epoch": 0.03814147018030513, + "grad_norm": 0.632282875996663, + "learning_rate": 7.628294036061026e-07, + "loss": 0.9502, + "step": 55 + }, + { + "epoch": 0.038834951456310676, + "grad_norm": 0.7755548632530835, + "learning_rate": 7.766990291262136e-07, + "loss": 1.0019, + "step": 56 + }, + { + "epoch": 0.03952843273231623, + "grad_norm": 0.6816306677937195, + "learning_rate": 7.905686546463247e-07, + "loss": 1.0584, + "step": 57 + }, + { + "epoch": 0.04022191400832178, + "grad_norm": 0.6869667996670505, + "learning_rate": 8.044382801664357e-07, + "loss": 1.0636, + "step": 58 + }, + { + "epoch": 0.040915395284327326, + "grad_norm": 0.6844077521420684, + "learning_rate": 8.183079056865466e-07, + "loss": 1.0049, + "step": 59 + }, + { + "epoch": 0.04160887656033287, + "grad_norm": 0.7295788670307389, + "learning_rate": 8.321775312066576e-07, + "loss": 0.9699, + "step": 60 + }, + { + "epoch": 0.04230235783633842, + "grad_norm": 0.6344974379462622, + "learning_rate": 8.460471567267684e-07, + "loss": 0.981, + "step": 61 + }, + { + "epoch": 0.04299583911234397, + "grad_norm": 0.6601574198153054, + "learning_rate": 8.599167822468794e-07, + "loss": 1.0688, + "step": 62 + }, + { + "epoch": 0.043689320388349516, + "grad_norm": 0.6590855794771459, + "learning_rate": 8.737864077669904e-07, + "loss": 0.9909, + "step": 63 + }, + { + "epoch": 0.044382801664355064, + "grad_norm": 0.6588222087234297, + "learning_rate": 8.876560332871013e-07, + "loss": 1.0405, + "step": 64 + }, + { + "epoch": 0.04507628294036061, + "grad_norm": 0.670816840796301, + "learning_rate": 9.015256588072123e-07, + "loss": 1.0596, + "step": 65 + }, + { + "epoch": 0.04576976421636616, + "grad_norm": 0.7457681485301779, + "learning_rate": 9.153952843273233e-07, + "loss": 1.058, + "step": 66 + }, + { + "epoch": 0.046463245492371706, + "grad_norm": 0.6730150097176245, + "learning_rate": 9.292649098474342e-07, + "loss": 1.0267, + "step": 67 + }, + { + "epoch": 0.047156726768377254, + "grad_norm": 0.6816232648360951, + "learning_rate": 9.431345353675451e-07, + "loss": 1.0179, + "step": 68 + }, + { + "epoch": 0.0478502080443828, + "grad_norm": 0.6806863554772085, + "learning_rate": 9.570041608876562e-07, + "loss": 1.04, + "step": 69 + }, + { + "epoch": 0.04854368932038835, + "grad_norm": 0.6722904703306192, + "learning_rate": 9.70873786407767e-07, + "loss": 1.0269, + "step": 70 + }, + { + "epoch": 0.049237170596393896, + "grad_norm": 0.6307523759370202, + "learning_rate": 9.84743411927878e-07, + "loss": 0.963, + "step": 71 + }, + { + "epoch": 0.049930651872399444, + "grad_norm": 0.690452055960613, + "learning_rate": 9.98613037447989e-07, + "loss": 0.9781, + "step": 72 + }, + { + "epoch": 0.05062413314840499, + "grad_norm": 0.6280269384455514, + "learning_rate": 1.0124826629680998e-06, + "loss": 0.9447, + "step": 73 + }, + { + "epoch": 0.05131761442441054, + "grad_norm": 0.6192366709339232, + "learning_rate": 1.026352288488211e-06, + "loss": 0.9516, + "step": 74 + }, + { + "epoch": 0.052011095700416086, + "grad_norm": 0.6582915392388792, + "learning_rate": 1.0402219140083218e-06, + "loss": 0.9755, + "step": 75 + }, + { + "epoch": 0.052704576976421634, + "grad_norm": 0.6247740559571989, + "learning_rate": 1.0540915395284328e-06, + "loss": 0.9531, + "step": 76 + }, + { + "epoch": 0.05339805825242718, + "grad_norm": 0.8715757920926887, + "learning_rate": 1.0679611650485437e-06, + "loss": 0.9983, + "step": 77 + }, + { + "epoch": 0.05409153952843273, + "grad_norm": 0.6230904855613747, + "learning_rate": 1.0818307905686548e-06, + "loss": 0.953, + "step": 78 + }, + { + "epoch": 0.054785020804438284, + "grad_norm": 0.6999829088478414, + "learning_rate": 1.0957004160887658e-06, + "loss": 0.9008, + "step": 79 + }, + { + "epoch": 0.05547850208044383, + "grad_norm": 0.652430833328779, + "learning_rate": 1.1095700416088767e-06, + "loss": 0.9974, + "step": 80 + }, + { + "epoch": 0.05617198335644938, + "grad_norm": 0.6423534831220379, + "learning_rate": 1.1234396671289876e-06, + "loss": 0.9604, + "step": 81 + }, + { + "epoch": 0.056865464632454926, + "grad_norm": 0.6038815891519935, + "learning_rate": 1.1373092926490986e-06, + "loss": 1.0135, + "step": 82 + }, + { + "epoch": 0.057558945908460474, + "grad_norm": 0.6539199107847871, + "learning_rate": 1.1511789181692095e-06, + "loss": 0.9841, + "step": 83 + }, + { + "epoch": 0.05825242718446602, + "grad_norm": 0.624314333182806, + "learning_rate": 1.1650485436893206e-06, + "loss": 1.0102, + "step": 84 + }, + { + "epoch": 0.05894590846047157, + "grad_norm": 0.6458015688268974, + "learning_rate": 1.1789181692094314e-06, + "loss": 0.9964, + "step": 85 + }, + { + "epoch": 0.059639389736477116, + "grad_norm": 0.6443432282203443, + "learning_rate": 1.1927877947295425e-06, + "loss": 0.9278, + "step": 86 + }, + { + "epoch": 0.060332871012482664, + "grad_norm": 0.6562209616139189, + "learning_rate": 1.2066574202496534e-06, + "loss": 0.9583, + "step": 87 + }, + { + "epoch": 0.06102635228848821, + "grad_norm": 0.5924884335147939, + "learning_rate": 1.2205270457697642e-06, + "loss": 0.9561, + "step": 88 + }, + { + "epoch": 0.06171983356449376, + "grad_norm": 0.6744890114343025, + "learning_rate": 1.2343966712898753e-06, + "loss": 0.9438, + "step": 89 + }, + { + "epoch": 0.06241331484049931, + "grad_norm": 0.6378026335633673, + "learning_rate": 1.2482662968099862e-06, + "loss": 1.007, + "step": 90 + }, + { + "epoch": 0.06310679611650485, + "grad_norm": 0.597100408771657, + "learning_rate": 1.2621359223300972e-06, + "loss": 1.0074, + "step": 91 + }, + { + "epoch": 0.0638002773925104, + "grad_norm": 0.6227995100866369, + "learning_rate": 1.276005547850208e-06, + "loss": 0.9649, + "step": 92 + }, + { + "epoch": 0.06449375866851595, + "grad_norm": 0.5989746070664526, + "learning_rate": 1.2898751733703192e-06, + "loss": 0.9215, + "step": 93 + }, + { + "epoch": 0.0651872399445215, + "grad_norm": 0.6702674731080163, + "learning_rate": 1.30374479889043e-06, + "loss": 0.9505, + "step": 94 + }, + { + "epoch": 0.06588072122052704, + "grad_norm": 0.7085493524866154, + "learning_rate": 1.3176144244105409e-06, + "loss": 0.9768, + "step": 95 + }, + { + "epoch": 0.06657420249653259, + "grad_norm": 0.7221883081125735, + "learning_rate": 1.331484049930652e-06, + "loss": 0.9901, + "step": 96 + }, + { + "epoch": 0.06726768377253814, + "grad_norm": 0.5582461615156555, + "learning_rate": 1.3453536754507628e-06, + "loss": 0.9002, + "step": 97 + }, + { + "epoch": 0.06796116504854369, + "grad_norm": 0.708061679173971, + "learning_rate": 1.359223300970874e-06, + "loss": 0.9588, + "step": 98 + }, + { + "epoch": 0.06865464632454923, + "grad_norm": 0.622931134839384, + "learning_rate": 1.3730929264909848e-06, + "loss": 0.9352, + "step": 99 + }, + { + "epoch": 0.06934812760055478, + "grad_norm": 0.5849387470895084, + "learning_rate": 1.3869625520110958e-06, + "loss": 0.9115, + "step": 100 + }, + { + "epoch": 0.07004160887656033, + "grad_norm": 0.5523372708912899, + "learning_rate": 1.4008321775312067e-06, + "loss": 0.9002, + "step": 101 + }, + { + "epoch": 0.07073509015256588, + "grad_norm": 0.5442657802829106, + "learning_rate": 1.4147018030513176e-06, + "loss": 0.9409, + "step": 102 + }, + { + "epoch": 0.07142857142857142, + "grad_norm": 0.5167094979882029, + "learning_rate": 1.4285714285714286e-06, + "loss": 0.8774, + "step": 103 + }, + { + "epoch": 0.07212205270457697, + "grad_norm": 0.4852800648235046, + "learning_rate": 1.4424410540915395e-06, + "loss": 0.8487, + "step": 104 + }, + { + "epoch": 0.07281553398058252, + "grad_norm": 0.48008720213910333, + "learning_rate": 1.4563106796116506e-06, + "loss": 0.8546, + "step": 105 + }, + { + "epoch": 0.07350901525658807, + "grad_norm": 0.5381970957355515, + "learning_rate": 1.4701803051317614e-06, + "loss": 0.9464, + "step": 106 + }, + { + "epoch": 0.07420249653259361, + "grad_norm": 0.5183008486942954, + "learning_rate": 1.4840499306518725e-06, + "loss": 1.002, + "step": 107 + }, + { + "epoch": 0.07489597780859916, + "grad_norm": 0.45637453449028764, + "learning_rate": 1.4979195561719834e-06, + "loss": 0.8192, + "step": 108 + }, + { + "epoch": 0.07558945908460471, + "grad_norm": 0.4881631488510109, + "learning_rate": 1.5117891816920942e-06, + "loss": 0.8049, + "step": 109 + }, + { + "epoch": 0.07628294036061026, + "grad_norm": 0.46223944342917794, + "learning_rate": 1.5256588072122053e-06, + "loss": 0.8434, + "step": 110 + }, + { + "epoch": 0.0769764216366158, + "grad_norm": 0.462576284785379, + "learning_rate": 1.5395284327323162e-06, + "loss": 0.9045, + "step": 111 + }, + { + "epoch": 0.07766990291262135, + "grad_norm": 0.4620343105037817, + "learning_rate": 1.5533980582524272e-06, + "loss": 0.8832, + "step": 112 + }, + { + "epoch": 0.07836338418862691, + "grad_norm": 0.4863508425853165, + "learning_rate": 1.5672676837725385e-06, + "loss": 0.8998, + "step": 113 + }, + { + "epoch": 0.07905686546463246, + "grad_norm": 0.4775428016819076, + "learning_rate": 1.5811373092926494e-06, + "loss": 0.898, + "step": 114 + }, + { + "epoch": 0.07975034674063801, + "grad_norm": 0.4830885527898184, + "learning_rate": 1.5950069348127602e-06, + "loss": 0.9021, + "step": 115 + }, + { + "epoch": 0.08044382801664356, + "grad_norm": 0.4869441989034695, + "learning_rate": 1.6088765603328713e-06, + "loss": 0.829, + "step": 116 + }, + { + "epoch": 0.0811373092926491, + "grad_norm": 0.49452604561754526, + "learning_rate": 1.6227461858529822e-06, + "loss": 0.9164, + "step": 117 + }, + { + "epoch": 0.08183079056865465, + "grad_norm": 0.4929690520607417, + "learning_rate": 1.6366158113730932e-06, + "loss": 0.9656, + "step": 118 + }, + { + "epoch": 0.0825242718446602, + "grad_norm": 0.4534570660489125, + "learning_rate": 1.650485436893204e-06, + "loss": 0.8346, + "step": 119 + }, + { + "epoch": 0.08321775312066575, + "grad_norm": 0.44896814698879267, + "learning_rate": 1.6643550624133152e-06, + "loss": 0.841, + "step": 120 + }, + { + "epoch": 0.0839112343966713, + "grad_norm": 0.49925693078485434, + "learning_rate": 1.678224687933426e-06, + "loss": 0.8941, + "step": 121 + }, + { + "epoch": 0.08460471567267684, + "grad_norm": 0.40632859239809527, + "learning_rate": 1.6920943134535369e-06, + "loss": 0.8305, + "step": 122 + }, + { + "epoch": 0.08529819694868239, + "grad_norm": 0.45585401202974063, + "learning_rate": 1.705963938973648e-06, + "loss": 0.8915, + "step": 123 + }, + { + "epoch": 0.08599167822468794, + "grad_norm": 0.5016058318337435, + "learning_rate": 1.7198335644937588e-06, + "loss": 0.8585, + "step": 124 + }, + { + "epoch": 0.08668515950069348, + "grad_norm": 0.44681488446956685, + "learning_rate": 1.73370319001387e-06, + "loss": 0.8212, + "step": 125 + }, + { + "epoch": 0.08737864077669903, + "grad_norm": 0.4294515784157133, + "learning_rate": 1.7475728155339808e-06, + "loss": 0.897, + "step": 126 + }, + { + "epoch": 0.08807212205270458, + "grad_norm": 0.46211389282086196, + "learning_rate": 1.7614424410540918e-06, + "loss": 0.9046, + "step": 127 + }, + { + "epoch": 0.08876560332871013, + "grad_norm": 0.47423509856518187, + "learning_rate": 1.7753120665742027e-06, + "loss": 0.958, + "step": 128 + }, + { + "epoch": 0.08945908460471567, + "grad_norm": 0.40273149375453604, + "learning_rate": 1.7891816920943136e-06, + "loss": 0.7942, + "step": 129 + }, + { + "epoch": 0.09015256588072122, + "grad_norm": 0.46600366567800233, + "learning_rate": 1.8030513176144246e-06, + "loss": 0.8167, + "step": 130 + }, + { + "epoch": 0.09084604715672677, + "grad_norm": 0.4927096876158672, + "learning_rate": 1.8169209431345355e-06, + "loss": 0.8075, + "step": 131 + }, + { + "epoch": 0.09153952843273232, + "grad_norm": 0.4235794052816225, + "learning_rate": 1.8307905686546466e-06, + "loss": 0.8225, + "step": 132 + }, + { + "epoch": 0.09223300970873786, + "grad_norm": 0.42089854766081586, + "learning_rate": 1.8446601941747574e-06, + "loss": 0.8454, + "step": 133 + }, + { + "epoch": 0.09292649098474341, + "grad_norm": 0.6510274398754528, + "learning_rate": 1.8585298196948685e-06, + "loss": 0.9127, + "step": 134 + }, + { + "epoch": 0.09361997226074896, + "grad_norm": 0.41182494622092, + "learning_rate": 1.8723994452149794e-06, + "loss": 0.799, + "step": 135 + }, + { + "epoch": 0.09431345353675451, + "grad_norm": 0.40906146194426646, + "learning_rate": 1.8862690707350902e-06, + "loss": 0.8329, + "step": 136 + }, + { + "epoch": 0.09500693481276005, + "grad_norm": 0.417522501030148, + "learning_rate": 1.9001386962552013e-06, + "loss": 0.7889, + "step": 137 + }, + { + "epoch": 0.0957004160887656, + "grad_norm": 0.4760702200066777, + "learning_rate": 1.9140083217753124e-06, + "loss": 0.9148, + "step": 138 + }, + { + "epoch": 0.09639389736477115, + "grad_norm": 0.4409560943629456, + "learning_rate": 1.927877947295423e-06, + "loss": 0.841, + "step": 139 + }, + { + "epoch": 0.0970873786407767, + "grad_norm": 0.4176130976832559, + "learning_rate": 1.941747572815534e-06, + "loss": 0.8701, + "step": 140 + }, + { + "epoch": 0.09778085991678225, + "grad_norm": 0.6424846457242896, + "learning_rate": 1.955617198335645e-06, + "loss": 0.8193, + "step": 141 + }, + { + "epoch": 0.09847434119278779, + "grad_norm": 0.4642123896065429, + "learning_rate": 1.969486823855756e-06, + "loss": 0.9056, + "step": 142 + }, + { + "epoch": 0.09916782246879334, + "grad_norm": 0.4614601263209169, + "learning_rate": 1.983356449375867e-06, + "loss": 0.8136, + "step": 143 + }, + { + "epoch": 0.09986130374479889, + "grad_norm": 0.386926521895237, + "learning_rate": 1.997226074895978e-06, + "loss": 0.7662, + "step": 144 + }, + { + "epoch": 0.10055478502080444, + "grad_norm": 0.4323327978431683, + "learning_rate": 2.011095700416089e-06, + "loss": 0.7478, + "step": 145 + }, + { + "epoch": 0.10124826629680998, + "grad_norm": 0.4048711893378798, + "learning_rate": 2.0249653259361997e-06, + "loss": 0.8134, + "step": 146 + }, + { + "epoch": 0.10194174757281553, + "grad_norm": 0.38406656989226945, + "learning_rate": 2.0388349514563107e-06, + "loss": 0.8047, + "step": 147 + }, + { + "epoch": 0.10263522884882108, + "grad_norm": 0.5988509236999207, + "learning_rate": 2.052704576976422e-06, + "loss": 0.8223, + "step": 148 + }, + { + "epoch": 0.10332871012482663, + "grad_norm": 0.4361669841858213, + "learning_rate": 2.066574202496533e-06, + "loss": 0.8688, + "step": 149 + }, + { + "epoch": 0.10402219140083217, + "grad_norm": 0.3824904511899543, + "learning_rate": 2.0804438280166435e-06, + "loss": 0.8234, + "step": 150 + }, + { + "epoch": 0.10471567267683772, + "grad_norm": 0.4737732666224156, + "learning_rate": 2.0943134535367546e-06, + "loss": 0.8359, + "step": 151 + }, + { + "epoch": 0.10540915395284327, + "grad_norm": 0.3686095534705079, + "learning_rate": 2.1081830790568657e-06, + "loss": 0.7598, + "step": 152 + }, + { + "epoch": 0.10610263522884882, + "grad_norm": 0.40975105551135427, + "learning_rate": 2.1220527045769763e-06, + "loss": 0.7943, + "step": 153 + }, + { + "epoch": 0.10679611650485436, + "grad_norm": 0.40738122820413203, + "learning_rate": 2.1359223300970874e-06, + "loss": 0.8312, + "step": 154 + }, + { + "epoch": 0.10748959778085991, + "grad_norm": 0.41387940078095947, + "learning_rate": 2.1497919556171985e-06, + "loss": 0.8691, + "step": 155 + }, + { + "epoch": 0.10818307905686546, + "grad_norm": 0.7368764876690216, + "learning_rate": 2.1636615811373096e-06, + "loss": 0.8323, + "step": 156 + }, + { + "epoch": 0.108876560332871, + "grad_norm": 0.5345760890309688, + "learning_rate": 2.17753120665742e-06, + "loss": 0.9664, + "step": 157 + }, + { + "epoch": 0.10957004160887657, + "grad_norm": 0.4131720622284066, + "learning_rate": 2.1914008321775317e-06, + "loss": 0.7368, + "step": 158 + }, + { + "epoch": 0.11026352288488211, + "grad_norm": 0.37733498699117274, + "learning_rate": 2.2052704576976423e-06, + "loss": 0.7489, + "step": 159 + }, + { + "epoch": 0.11095700416088766, + "grad_norm": 0.37270222722580476, + "learning_rate": 2.2191400832177534e-06, + "loss": 0.8096, + "step": 160 + }, + { + "epoch": 0.11165048543689321, + "grad_norm": 0.4058821981599631, + "learning_rate": 2.2330097087378645e-06, + "loss": 0.7868, + "step": 161 + }, + { + "epoch": 0.11234396671289876, + "grad_norm": 0.37964105090858946, + "learning_rate": 2.246879334257975e-06, + "loss": 0.7857, + "step": 162 + }, + { + "epoch": 0.1130374479889043, + "grad_norm": 0.4115195795365264, + "learning_rate": 2.2607489597780862e-06, + "loss": 0.7654, + "step": 163 + }, + { + "epoch": 0.11373092926490985, + "grad_norm": 0.45218882365511776, + "learning_rate": 2.2746185852981973e-06, + "loss": 0.8285, + "step": 164 + }, + { + "epoch": 0.1144244105409154, + "grad_norm": 0.4283307454414197, + "learning_rate": 2.2884882108183084e-06, + "loss": 0.8012, + "step": 165 + }, + { + "epoch": 0.11511789181692095, + "grad_norm": 0.37898471377966636, + "learning_rate": 2.302357836338419e-06, + "loss": 0.758, + "step": 166 + }, + { + "epoch": 0.1158113730929265, + "grad_norm": 0.4050126785782291, + "learning_rate": 2.31622746185853e-06, + "loss": 0.8699, + "step": 167 + }, + { + "epoch": 0.11650485436893204, + "grad_norm": 0.4475975205170183, + "learning_rate": 2.330097087378641e-06, + "loss": 0.7779, + "step": 168 + }, + { + "epoch": 0.11719833564493759, + "grad_norm": 0.4089845343812099, + "learning_rate": 2.343966712898752e-06, + "loss": 0.6968, + "step": 169 + }, + { + "epoch": 0.11789181692094314, + "grad_norm": 0.40249580215901204, + "learning_rate": 2.357836338418863e-06, + "loss": 0.7987, + "step": 170 + }, + { + "epoch": 0.11858529819694869, + "grad_norm": 0.42775199041769624, + "learning_rate": 2.371705963938974e-06, + "loss": 0.7305, + "step": 171 + }, + { + "epoch": 0.11927877947295423, + "grad_norm": 0.4017014413031526, + "learning_rate": 2.385575589459085e-06, + "loss": 0.7023, + "step": 172 + }, + { + "epoch": 0.11997226074895978, + "grad_norm": 0.36589722717314527, + "learning_rate": 2.3994452149791957e-06, + "loss": 0.7588, + "step": 173 + }, + { + "epoch": 0.12066574202496533, + "grad_norm": 0.6159883968990059, + "learning_rate": 2.4133148404993067e-06, + "loss": 0.7644, + "step": 174 + }, + { + "epoch": 0.12135922330097088, + "grad_norm": 0.370659678569985, + "learning_rate": 2.427184466019418e-06, + "loss": 0.7615, + "step": 175 + }, + { + "epoch": 0.12205270457697642, + "grad_norm": 0.42840548038986814, + "learning_rate": 2.4410540915395285e-06, + "loss": 0.7462, + "step": 176 + }, + { + "epoch": 0.12274618585298197, + "grad_norm": 0.38134854059108925, + "learning_rate": 2.4549237170596395e-06, + "loss": 0.8004, + "step": 177 + }, + { + "epoch": 0.12343966712898752, + "grad_norm": 0.44181385340495555, + "learning_rate": 2.4687933425797506e-06, + "loss": 0.7343, + "step": 178 + }, + { + "epoch": 0.12413314840499307, + "grad_norm": 0.4224670939920034, + "learning_rate": 2.4826629680998617e-06, + "loss": 0.7752, + "step": 179 + }, + { + "epoch": 0.12482662968099861, + "grad_norm": 0.36104530028643944, + "learning_rate": 2.4965325936199723e-06, + "loss": 0.7381, + "step": 180 + }, + { + "epoch": 0.12552011095700416, + "grad_norm": 0.44005429051414174, + "learning_rate": 2.5104022191400834e-06, + "loss": 0.7387, + "step": 181 + }, + { + "epoch": 0.1262135922330097, + "grad_norm": 0.5202060042479089, + "learning_rate": 2.5242718446601945e-06, + "loss": 0.8152, + "step": 182 + }, + { + "epoch": 0.12690707350901526, + "grad_norm": 0.3859967108361711, + "learning_rate": 2.538141470180305e-06, + "loss": 0.7155, + "step": 183 + }, + { + "epoch": 0.1276005547850208, + "grad_norm": 0.35556697652262786, + "learning_rate": 2.552011095700416e-06, + "loss": 0.7453, + "step": 184 + }, + { + "epoch": 0.12829403606102635, + "grad_norm": 0.440562926518887, + "learning_rate": 2.5658807212205273e-06, + "loss": 0.8335, + "step": 185 + }, + { + "epoch": 0.1289875173370319, + "grad_norm": 0.41641306148148105, + "learning_rate": 2.5797503467406383e-06, + "loss": 0.8509, + "step": 186 + }, + { + "epoch": 0.12968099861303745, + "grad_norm": 0.43837368942438204, + "learning_rate": 2.593619972260749e-06, + "loss": 0.7717, + "step": 187 + }, + { + "epoch": 0.130374479889043, + "grad_norm": 0.42195900363377553, + "learning_rate": 2.60748959778086e-06, + "loss": 0.7189, + "step": 188 + }, + { + "epoch": 0.13106796116504854, + "grad_norm": 0.40185288621909915, + "learning_rate": 2.621359223300971e-06, + "loss": 0.7455, + "step": 189 + }, + { + "epoch": 0.1317614424410541, + "grad_norm": 0.42981098774319276, + "learning_rate": 2.6352288488210818e-06, + "loss": 0.7765, + "step": 190 + }, + { + "epoch": 0.13245492371705964, + "grad_norm": 0.41750835261774816, + "learning_rate": 2.649098474341193e-06, + "loss": 0.7922, + "step": 191 + }, + { + "epoch": 0.13314840499306518, + "grad_norm": 0.4529181884791232, + "learning_rate": 2.662968099861304e-06, + "loss": 0.8118, + "step": 192 + }, + { + "epoch": 0.13384188626907073, + "grad_norm": 0.4403920855769716, + "learning_rate": 2.676837725381415e-06, + "loss": 0.6822, + "step": 193 + }, + { + "epoch": 0.13453536754507628, + "grad_norm": 0.38722274562520886, + "learning_rate": 2.6907073509015257e-06, + "loss": 0.7246, + "step": 194 + }, + { + "epoch": 0.13522884882108183, + "grad_norm": 0.4714787915536026, + "learning_rate": 2.7045769764216367e-06, + "loss": 0.7957, + "step": 195 + }, + { + "epoch": 0.13592233009708737, + "grad_norm": 0.42150957107657083, + "learning_rate": 2.718446601941748e-06, + "loss": 0.7434, + "step": 196 + }, + { + "epoch": 0.13661581137309292, + "grad_norm": 0.5216718201159719, + "learning_rate": 2.7323162274618584e-06, + "loss": 0.7357, + "step": 197 + }, + { + "epoch": 0.13730929264909847, + "grad_norm": 0.3878172010520359, + "learning_rate": 2.7461858529819695e-06, + "loss": 0.7129, + "step": 198 + }, + { + "epoch": 0.13800277392510402, + "grad_norm": 0.47665747018323956, + "learning_rate": 2.7600554785020806e-06, + "loss": 0.7987, + "step": 199 + }, + { + "epoch": 0.13869625520110956, + "grad_norm": 0.4128992259952167, + "learning_rate": 2.7739251040221917e-06, + "loss": 0.7241, + "step": 200 + }, + { + "epoch": 0.1393897364771151, + "grad_norm": 0.4396050055925165, + "learning_rate": 2.7877947295423023e-06, + "loss": 0.797, + "step": 201 + }, + { + "epoch": 0.14008321775312066, + "grad_norm": 0.4004272257193986, + "learning_rate": 2.8016643550624134e-06, + "loss": 0.7595, + "step": 202 + }, + { + "epoch": 0.1407766990291262, + "grad_norm": 0.3710027468305455, + "learning_rate": 2.8155339805825245e-06, + "loss": 0.7712, + "step": 203 + }, + { + "epoch": 0.14147018030513175, + "grad_norm": 0.3844728424521066, + "learning_rate": 2.829403606102635e-06, + "loss": 0.8086, + "step": 204 + }, + { + "epoch": 0.1421636615811373, + "grad_norm": 0.4119582628671637, + "learning_rate": 2.843273231622746e-06, + "loss": 0.6663, + "step": 205 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 0.4573212518732153, + "learning_rate": 2.8571428571428573e-06, + "loss": 0.8165, + "step": 206 + }, + { + "epoch": 0.1435506241331484, + "grad_norm": 0.4050016525913954, + "learning_rate": 2.8710124826629683e-06, + "loss": 0.7374, + "step": 207 + }, + { + "epoch": 0.14424410540915394, + "grad_norm": 0.40466688502984693, + "learning_rate": 2.884882108183079e-06, + "loss": 0.7314, + "step": 208 + }, + { + "epoch": 0.1449375866851595, + "grad_norm": 0.4178896117001944, + "learning_rate": 2.89875173370319e-06, + "loss": 0.7978, + "step": 209 + }, + { + "epoch": 0.14563106796116504, + "grad_norm": 0.3787764219509587, + "learning_rate": 2.912621359223301e-06, + "loss": 0.7783, + "step": 210 + }, + { + "epoch": 0.1463245492371706, + "grad_norm": 0.45474506304874823, + "learning_rate": 2.9264909847434118e-06, + "loss": 0.6931, + "step": 211 + }, + { + "epoch": 0.14701803051317613, + "grad_norm": 0.4671029494611205, + "learning_rate": 2.940360610263523e-06, + "loss": 0.7796, + "step": 212 + }, + { + "epoch": 0.14771151178918168, + "grad_norm": 0.45047096334063835, + "learning_rate": 2.954230235783634e-06, + "loss": 0.8387, + "step": 213 + }, + { + "epoch": 0.14840499306518723, + "grad_norm": 0.38803404878012426, + "learning_rate": 2.968099861303745e-06, + "loss": 0.6888, + "step": 214 + }, + { + "epoch": 0.14909847434119278, + "grad_norm": 0.3943536793994258, + "learning_rate": 2.9819694868238556e-06, + "loss": 0.7059, + "step": 215 + }, + { + "epoch": 0.14979195561719832, + "grad_norm": 0.40804727263899615, + "learning_rate": 2.9958391123439667e-06, + "loss": 0.7399, + "step": 216 + }, + { + "epoch": 0.15048543689320387, + "grad_norm": 0.4103653828487261, + "learning_rate": 3.0097087378640778e-06, + "loss": 0.7439, + "step": 217 + }, + { + "epoch": 0.15117891816920942, + "grad_norm": 0.4695251055502895, + "learning_rate": 3.0235783633841884e-06, + "loss": 0.7459, + "step": 218 + }, + { + "epoch": 0.15187239944521497, + "grad_norm": 0.36981358397849723, + "learning_rate": 3.0374479889042995e-06, + "loss": 0.6762, + "step": 219 + }, + { + "epoch": 0.15256588072122051, + "grad_norm": 0.45866020164912186, + "learning_rate": 3.0513176144244106e-06, + "loss": 0.697, + "step": 220 + }, + { + "epoch": 0.15325936199722606, + "grad_norm": 0.4250655406061998, + "learning_rate": 3.0651872399445217e-06, + "loss": 0.7034, + "step": 221 + }, + { + "epoch": 0.1539528432732316, + "grad_norm": 0.4986501675683246, + "learning_rate": 3.0790568654646323e-06, + "loss": 0.6957, + "step": 222 + }, + { + "epoch": 0.15464632454923716, + "grad_norm": 0.4320858977385933, + "learning_rate": 3.0929264909847434e-06, + "loss": 0.6711, + "step": 223 + }, + { + "epoch": 0.1553398058252427, + "grad_norm": 0.3905585489893339, + "learning_rate": 3.1067961165048544e-06, + "loss": 0.6946, + "step": 224 + }, + { + "epoch": 0.15603328710124825, + "grad_norm": 0.5691385690117634, + "learning_rate": 3.120665742024965e-06, + "loss": 0.713, + "step": 225 + }, + { + "epoch": 0.15672676837725383, + "grad_norm": 0.4018553467780068, + "learning_rate": 3.134535367545077e-06, + "loss": 0.7723, + "step": 226 + }, + { + "epoch": 0.15742024965325938, + "grad_norm": 0.4341017943701606, + "learning_rate": 3.1484049930651877e-06, + "loss": 0.7205, + "step": 227 + }, + { + "epoch": 0.15811373092926492, + "grad_norm": 0.34455071912195556, + "learning_rate": 3.1622746185852987e-06, + "loss": 0.6464, + "step": 228 + }, + { + "epoch": 0.15880721220527047, + "grad_norm": 0.4890969754146096, + "learning_rate": 3.17614424410541e-06, + "loss": 0.7859, + "step": 229 + }, + { + "epoch": 0.15950069348127602, + "grad_norm": 0.4372650975732535, + "learning_rate": 3.1900138696255205e-06, + "loss": 0.7823, + "step": 230 + }, + { + "epoch": 0.16019417475728157, + "grad_norm": 0.3888984885548787, + "learning_rate": 3.2038834951456315e-06, + "loss": 0.7053, + "step": 231 + }, + { + "epoch": 0.1608876560332871, + "grad_norm": 0.4441693569558216, + "learning_rate": 3.2177531206657426e-06, + "loss": 0.8003, + "step": 232 + }, + { + "epoch": 0.16158113730929266, + "grad_norm": 0.45285413547070325, + "learning_rate": 3.2316227461858537e-06, + "loss": 0.7537, + "step": 233 + }, + { + "epoch": 0.1622746185852982, + "grad_norm": 0.3764247733055762, + "learning_rate": 3.2454923717059643e-06, + "loss": 0.6972, + "step": 234 + }, + { + "epoch": 0.16296809986130376, + "grad_norm": 0.3665788718235755, + "learning_rate": 3.2593619972260754e-06, + "loss": 0.7249, + "step": 235 + }, + { + "epoch": 0.1636615811373093, + "grad_norm": 0.3686969545881345, + "learning_rate": 3.2732316227461865e-06, + "loss": 0.6693, + "step": 236 + }, + { + "epoch": 0.16435506241331485, + "grad_norm": 0.3766909229645527, + "learning_rate": 3.287101248266297e-06, + "loss": 0.7325, + "step": 237 + }, + { + "epoch": 0.1650485436893204, + "grad_norm": 0.42314048885712235, + "learning_rate": 3.300970873786408e-06, + "loss": 0.7642, + "step": 238 + }, + { + "epoch": 0.16574202496532595, + "grad_norm": 0.4252000850307808, + "learning_rate": 3.3148404993065193e-06, + "loss": 0.677, + "step": 239 + }, + { + "epoch": 0.1664355062413315, + "grad_norm": 0.38912020789020796, + "learning_rate": 3.3287101248266303e-06, + "loss": 0.7272, + "step": 240 + }, + { + "epoch": 0.16712898751733704, + "grad_norm": 0.382307745645276, + "learning_rate": 3.342579750346741e-06, + "loss": 0.6513, + "step": 241 + }, + { + "epoch": 0.1678224687933426, + "grad_norm": 0.4527453287260334, + "learning_rate": 3.356449375866852e-06, + "loss": 0.7548, + "step": 242 + }, + { + "epoch": 0.16851595006934814, + "grad_norm": 0.38643663324749034, + "learning_rate": 3.370319001386963e-06, + "loss": 0.7054, + "step": 243 + }, + { + "epoch": 0.16920943134535368, + "grad_norm": 0.434433662005887, + "learning_rate": 3.3841886269070738e-06, + "loss": 0.7379, + "step": 244 + }, + { + "epoch": 0.16990291262135923, + "grad_norm": 0.41999529673546193, + "learning_rate": 3.398058252427185e-06, + "loss": 0.7369, + "step": 245 + }, + { + "epoch": 0.17059639389736478, + "grad_norm": 0.40543406439208945, + "learning_rate": 3.411927877947296e-06, + "loss": 0.7346, + "step": 246 + }, + { + "epoch": 0.17128987517337033, + "grad_norm": 0.44451792117385214, + "learning_rate": 3.425797503467407e-06, + "loss": 0.6726, + "step": 247 + }, + { + "epoch": 0.17198335644937587, + "grad_norm": 0.4370238537183958, + "learning_rate": 3.4396671289875176e-06, + "loss": 0.7119, + "step": 248 + }, + { + "epoch": 0.17267683772538142, + "grad_norm": 0.3618909161517529, + "learning_rate": 3.4535367545076287e-06, + "loss": 0.6303, + "step": 249 + }, + { + "epoch": 0.17337031900138697, + "grad_norm": 0.4362454159660753, + "learning_rate": 3.46740638002774e-06, + "loss": 0.6872, + "step": 250 + }, + { + "epoch": 0.17406380027739252, + "grad_norm": 0.40788222654825634, + "learning_rate": 3.4812760055478504e-06, + "loss": 0.7798, + "step": 251 + }, + { + "epoch": 0.17475728155339806, + "grad_norm": 0.40646895325408444, + "learning_rate": 3.4951456310679615e-06, + "loss": 0.7273, + "step": 252 + }, + { + "epoch": 0.1754507628294036, + "grad_norm": 0.3970649147498973, + "learning_rate": 3.5090152565880726e-06, + "loss": 0.7415, + "step": 253 + }, + { + "epoch": 0.17614424410540916, + "grad_norm": 0.8738028260247339, + "learning_rate": 3.5228848821081837e-06, + "loss": 0.7357, + "step": 254 + }, + { + "epoch": 0.1768377253814147, + "grad_norm": 0.4383888286551231, + "learning_rate": 3.5367545076282943e-06, + "loss": 0.7452, + "step": 255 + }, + { + "epoch": 0.17753120665742025, + "grad_norm": 0.4430590745242103, + "learning_rate": 3.5506241331484054e-06, + "loss": 0.6778, + "step": 256 + }, + { + "epoch": 0.1782246879334258, + "grad_norm": 0.39890463053500536, + "learning_rate": 3.5644937586685165e-06, + "loss": 0.7401, + "step": 257 + }, + { + "epoch": 0.17891816920943135, + "grad_norm": 0.39432385105373763, + "learning_rate": 3.578363384188627e-06, + "loss": 0.6875, + "step": 258 + }, + { + "epoch": 0.1796116504854369, + "grad_norm": 0.4614461680532051, + "learning_rate": 3.592233009708738e-06, + "loss": 0.7385, + "step": 259 + }, + { + "epoch": 0.18030513176144244, + "grad_norm": 0.46226064481570694, + "learning_rate": 3.6061026352288493e-06, + "loss": 0.6958, + "step": 260 + }, + { + "epoch": 0.180998613037448, + "grad_norm": 0.37990529350392716, + "learning_rate": 3.6199722607489603e-06, + "loss": 0.7773, + "step": 261 + }, + { + "epoch": 0.18169209431345354, + "grad_norm": 0.40180373286019033, + "learning_rate": 3.633841886269071e-06, + "loss": 0.7238, + "step": 262 + }, + { + "epoch": 0.1823855755894591, + "grad_norm": 0.40427696266686086, + "learning_rate": 3.647711511789182e-06, + "loss": 0.7307, + "step": 263 + }, + { + "epoch": 0.18307905686546463, + "grad_norm": 0.5184525439199673, + "learning_rate": 3.661581137309293e-06, + "loss": 0.7426, + "step": 264 + }, + { + "epoch": 0.18377253814147018, + "grad_norm": 0.43717853016028557, + "learning_rate": 3.6754507628294038e-06, + "loss": 0.7125, + "step": 265 + }, + { + "epoch": 0.18446601941747573, + "grad_norm": 0.3735704789665749, + "learning_rate": 3.689320388349515e-06, + "loss": 0.6825, + "step": 266 + }, + { + "epoch": 0.18515950069348128, + "grad_norm": 0.3950750781681265, + "learning_rate": 3.703190013869626e-06, + "loss": 0.6577, + "step": 267 + }, + { + "epoch": 0.18585298196948682, + "grad_norm": 0.40969662640678584, + "learning_rate": 3.717059639389737e-06, + "loss": 0.6832, + "step": 268 + }, + { + "epoch": 0.18654646324549237, + "grad_norm": 0.8433752708079951, + "learning_rate": 3.7309292649098476e-06, + "loss": 0.7512, + "step": 269 + }, + { + "epoch": 0.18723994452149792, + "grad_norm": 0.37305785462119795, + "learning_rate": 3.7447988904299587e-06, + "loss": 0.74, + "step": 270 + }, + { + "epoch": 0.18793342579750347, + "grad_norm": 0.3789588688093465, + "learning_rate": 3.7586685159500698e-06, + "loss": 0.7348, + "step": 271 + }, + { + "epoch": 0.18862690707350901, + "grad_norm": 0.3616849761320114, + "learning_rate": 3.7725381414701804e-06, + "loss": 0.6887, + "step": 272 + }, + { + "epoch": 0.18932038834951456, + "grad_norm": 0.5018112098723796, + "learning_rate": 3.7864077669902915e-06, + "loss": 0.6886, + "step": 273 + }, + { + "epoch": 0.1900138696255201, + "grad_norm": 0.36718542081499966, + "learning_rate": 3.8002773925104026e-06, + "loss": 0.6481, + "step": 274 + }, + { + "epoch": 0.19070735090152566, + "grad_norm": 0.3923365942952953, + "learning_rate": 3.8141470180305136e-06, + "loss": 0.733, + "step": 275 + }, + { + "epoch": 0.1914008321775312, + "grad_norm": 0.3735527871806657, + "learning_rate": 3.828016643550625e-06, + "loss": 0.7018, + "step": 276 + }, + { + "epoch": 0.19209431345353675, + "grad_norm": 0.4386730270594253, + "learning_rate": 3.841886269070735e-06, + "loss": 0.76, + "step": 277 + }, + { + "epoch": 0.1927877947295423, + "grad_norm": 0.37686535934554893, + "learning_rate": 3.855755894590846e-06, + "loss": 0.6579, + "step": 278 + }, + { + "epoch": 0.19348127600554785, + "grad_norm": 0.3866861443374644, + "learning_rate": 3.8696255201109575e-06, + "loss": 0.7, + "step": 279 + }, + { + "epoch": 0.1941747572815534, + "grad_norm": 0.5290966775461156, + "learning_rate": 3.883495145631068e-06, + "loss": 0.7431, + "step": 280 + }, + { + "epoch": 0.19486823855755894, + "grad_norm": 0.3962785445821349, + "learning_rate": 3.897364771151179e-06, + "loss": 0.7803, + "step": 281 + }, + { + "epoch": 0.1955617198335645, + "grad_norm": 0.3879033584112287, + "learning_rate": 3.91123439667129e-06, + "loss": 0.6738, + "step": 282 + }, + { + "epoch": 0.19625520110957004, + "grad_norm": 0.38041620346715294, + "learning_rate": 3.925104022191401e-06, + "loss": 0.6293, + "step": 283 + }, + { + "epoch": 0.19694868238557559, + "grad_norm": 0.4648867288637665, + "learning_rate": 3.938973647711512e-06, + "loss": 0.7474, + "step": 284 + }, + { + "epoch": 0.19764216366158113, + "grad_norm": 0.7660847778048131, + "learning_rate": 3.952843273231623e-06, + "loss": 0.7282, + "step": 285 + }, + { + "epoch": 0.19833564493758668, + "grad_norm": 0.4086871296085051, + "learning_rate": 3.966712898751734e-06, + "loss": 0.6426, + "step": 286 + }, + { + "epoch": 0.19902912621359223, + "grad_norm": 0.4164392495091013, + "learning_rate": 3.980582524271845e-06, + "loss": 0.7051, + "step": 287 + }, + { + "epoch": 0.19972260748959778, + "grad_norm": 0.41106476170361106, + "learning_rate": 3.994452149791956e-06, + "loss": 0.6931, + "step": 288 + }, + { + "epoch": 0.20041608876560332, + "grad_norm": 0.4077108909855236, + "learning_rate": 4.0083217753120665e-06, + "loss": 0.7315, + "step": 289 + }, + { + "epoch": 0.20110957004160887, + "grad_norm": 0.3482064528393862, + "learning_rate": 4.022191400832178e-06, + "loss": 0.5911, + "step": 290 + }, + { + "epoch": 0.20180305131761442, + "grad_norm": 0.37430326552593113, + "learning_rate": 4.036061026352289e-06, + "loss": 0.7335, + "step": 291 + }, + { + "epoch": 0.20249653259361997, + "grad_norm": 0.40684380286537025, + "learning_rate": 4.049930651872399e-06, + "loss": 0.6855, + "step": 292 + }, + { + "epoch": 0.2031900138696255, + "grad_norm": 0.360212933581968, + "learning_rate": 4.063800277392511e-06, + "loss": 0.6422, + "step": 293 + }, + { + "epoch": 0.20388349514563106, + "grad_norm": 0.39792381205101135, + "learning_rate": 4.0776699029126215e-06, + "loss": 0.721, + "step": 294 + }, + { + "epoch": 0.2045769764216366, + "grad_norm": 0.4453439910657429, + "learning_rate": 4.091539528432732e-06, + "loss": 0.7601, + "step": 295 + }, + { + "epoch": 0.20527045769764216, + "grad_norm": 0.34831754314449653, + "learning_rate": 4.105409153952844e-06, + "loss": 0.7018, + "step": 296 + }, + { + "epoch": 0.2059639389736477, + "grad_norm": 0.3898897307469491, + "learning_rate": 4.119278779472954e-06, + "loss": 0.6699, + "step": 297 + }, + { + "epoch": 0.20665742024965325, + "grad_norm": 0.40060497596979705, + "learning_rate": 4.133148404993066e-06, + "loss": 0.7531, + "step": 298 + }, + { + "epoch": 0.2073509015256588, + "grad_norm": 0.40441239042015287, + "learning_rate": 4.1470180305131764e-06, + "loss": 0.7485, + "step": 299 + }, + { + "epoch": 0.20804438280166435, + "grad_norm": 0.4237334246726315, + "learning_rate": 4.160887656033287e-06, + "loss": 0.6922, + "step": 300 + }, + { + "epoch": 0.2087378640776699, + "grad_norm": 0.3741007592381228, + "learning_rate": 4.1747572815533986e-06, + "loss": 0.6715, + "step": 301 + }, + { + "epoch": 0.20943134535367544, + "grad_norm": 0.5389226606185195, + "learning_rate": 4.188626907073509e-06, + "loss": 0.6939, + "step": 302 + }, + { + "epoch": 0.210124826629681, + "grad_norm": 0.3862773522692115, + "learning_rate": 4.20249653259362e-06, + "loss": 0.7199, + "step": 303 + }, + { + "epoch": 0.21081830790568654, + "grad_norm": 0.39457682805996036, + "learning_rate": 4.216366158113731e-06, + "loss": 0.7211, + "step": 304 + }, + { + "epoch": 0.21151178918169208, + "grad_norm": 0.3924237038966114, + "learning_rate": 4.230235783633842e-06, + "loss": 0.6458, + "step": 305 + }, + { + "epoch": 0.21220527045769763, + "grad_norm": 0.41825071907435474, + "learning_rate": 4.244105409153953e-06, + "loss": 0.754, + "step": 306 + }, + { + "epoch": 0.21289875173370318, + "grad_norm": 0.4297642196475537, + "learning_rate": 4.257975034674064e-06, + "loss": 0.6654, + "step": 307 + }, + { + "epoch": 0.21359223300970873, + "grad_norm": 0.3520622437956551, + "learning_rate": 4.271844660194175e-06, + "loss": 0.643, + "step": 308 + }, + { + "epoch": 0.21428571428571427, + "grad_norm": 0.4044597557522868, + "learning_rate": 4.2857142857142855e-06, + "loss": 0.7518, + "step": 309 + }, + { + "epoch": 0.21497919556171982, + "grad_norm": 0.3713455415699211, + "learning_rate": 4.299583911234397e-06, + "loss": 0.6956, + "step": 310 + }, + { + "epoch": 0.21567267683772537, + "grad_norm": 0.42093578154492384, + "learning_rate": 4.313453536754508e-06, + "loss": 0.6999, + "step": 311 + }, + { + "epoch": 0.21636615811373092, + "grad_norm": 0.3948856956442142, + "learning_rate": 4.327323162274619e-06, + "loss": 0.7453, + "step": 312 + }, + { + "epoch": 0.21705963938973646, + "grad_norm": 0.36924587597247616, + "learning_rate": 4.34119278779473e-06, + "loss": 0.7046, + "step": 313 + }, + { + "epoch": 0.217753120665742, + "grad_norm": 0.37060128569933265, + "learning_rate": 4.35506241331484e-06, + "loss": 0.6666, + "step": 314 + }, + { + "epoch": 0.21844660194174756, + "grad_norm": 0.3817264949494108, + "learning_rate": 4.368932038834952e-06, + "loss": 0.7588, + "step": 315 + }, + { + "epoch": 0.21914008321775313, + "grad_norm": 0.44279359077385344, + "learning_rate": 4.382801664355063e-06, + "loss": 0.7247, + "step": 316 + }, + { + "epoch": 0.21983356449375868, + "grad_norm": 0.41286292621557696, + "learning_rate": 4.396671289875174e-06, + "loss": 0.722, + "step": 317 + }, + { + "epoch": 0.22052704576976423, + "grad_norm": 0.3665446610582214, + "learning_rate": 4.410540915395285e-06, + "loss": 0.7042, + "step": 318 + }, + { + "epoch": 0.22122052704576978, + "grad_norm": 0.4064325674582958, + "learning_rate": 4.424410540915396e-06, + "loss": 0.6911, + "step": 319 + }, + { + "epoch": 0.22191400832177532, + "grad_norm": 0.3527947659626731, + "learning_rate": 4.438280166435507e-06, + "loss": 0.6782, + "step": 320 + }, + { + "epoch": 0.22260748959778087, + "grad_norm": 0.4122497012373624, + "learning_rate": 4.4521497919556175e-06, + "loss": 0.6382, + "step": 321 + }, + { + "epoch": 0.22330097087378642, + "grad_norm": 0.4171792992825532, + "learning_rate": 4.466019417475729e-06, + "loss": 0.7077, + "step": 322 + }, + { + "epoch": 0.22399445214979197, + "grad_norm": 0.4009641540334108, + "learning_rate": 4.47988904299584e-06, + "loss": 0.6752, + "step": 323 + }, + { + "epoch": 0.22468793342579751, + "grad_norm": 0.3683855640327117, + "learning_rate": 4.49375866851595e-06, + "loss": 0.6811, + "step": 324 + }, + { + "epoch": 0.22538141470180306, + "grad_norm": 0.33393299149542643, + "learning_rate": 4.507628294036062e-06, + "loss": 0.6586, + "step": 325 + }, + { + "epoch": 0.2260748959778086, + "grad_norm": 0.6062995504813962, + "learning_rate": 4.5214979195561724e-06, + "loss": 0.6465, + "step": 326 + }, + { + "epoch": 0.22676837725381416, + "grad_norm": 0.3973099576565524, + "learning_rate": 4.535367545076284e-06, + "loss": 0.7261, + "step": 327 + }, + { + "epoch": 0.2274618585298197, + "grad_norm": 0.3908901589205917, + "learning_rate": 4.5492371705963946e-06, + "loss": 0.684, + "step": 328 + }, + { + "epoch": 0.22815533980582525, + "grad_norm": 0.3775681435948479, + "learning_rate": 4.563106796116505e-06, + "loss": 0.673, + "step": 329 + }, + { + "epoch": 0.2288488210818308, + "grad_norm": 0.3900280063158446, + "learning_rate": 4.576976421636617e-06, + "loss": 0.6897, + "step": 330 + }, + { + "epoch": 0.22954230235783635, + "grad_norm": 0.37078725700628873, + "learning_rate": 4.590846047156727e-06, + "loss": 0.6874, + "step": 331 + }, + { + "epoch": 0.2302357836338419, + "grad_norm": 0.38792361557953897, + "learning_rate": 4.604715672676838e-06, + "loss": 0.7129, + "step": 332 + }, + { + "epoch": 0.23092926490984744, + "grad_norm": 0.38616002138886846, + "learning_rate": 4.6185852981969495e-06, + "loss": 0.6745, + "step": 333 + }, + { + "epoch": 0.231622746185853, + "grad_norm": 0.32852732250748046, + "learning_rate": 4.63245492371706e-06, + "loss": 0.6635, + "step": 334 + }, + { + "epoch": 0.23231622746185854, + "grad_norm": 0.4457611585675984, + "learning_rate": 4.646324549237171e-06, + "loss": 0.7158, + "step": 335 + }, + { + "epoch": 0.23300970873786409, + "grad_norm": 0.40236756258934586, + "learning_rate": 4.660194174757282e-06, + "loss": 0.7616, + "step": 336 + }, + { + "epoch": 0.23370319001386963, + "grad_norm": 0.3599443137364842, + "learning_rate": 4.674063800277393e-06, + "loss": 0.6883, + "step": 337 + }, + { + "epoch": 0.23439667128987518, + "grad_norm": 0.41547414287949047, + "learning_rate": 4.687933425797504e-06, + "loss": 0.7555, + "step": 338 + }, + { + "epoch": 0.23509015256588073, + "grad_norm": 0.37873565331028053, + "learning_rate": 4.701803051317615e-06, + "loss": 0.6402, + "step": 339 + }, + { + "epoch": 0.23578363384188628, + "grad_norm": 0.39684279481484297, + "learning_rate": 4.715672676837726e-06, + "loss": 0.6315, + "step": 340 + }, + { + "epoch": 0.23647711511789182, + "grad_norm": 0.6091528195070264, + "learning_rate": 4.729542302357837e-06, + "loss": 0.7367, + "step": 341 + }, + { + "epoch": 0.23717059639389737, + "grad_norm": 0.6825847378059929, + "learning_rate": 4.743411927877948e-06, + "loss": 0.6655, + "step": 342 + }, + { + "epoch": 0.23786407766990292, + "grad_norm": 0.43983874334583867, + "learning_rate": 4.7572815533980585e-06, + "loss": 0.6389, + "step": 343 + }, + { + "epoch": 0.23855755894590847, + "grad_norm": 0.39966578241418427, + "learning_rate": 4.77115117891817e-06, + "loss": 0.7489, + "step": 344 + }, + { + "epoch": 0.239251040221914, + "grad_norm": 0.43569735103122237, + "learning_rate": 4.785020804438281e-06, + "loss": 0.668, + "step": 345 + }, + { + "epoch": 0.23994452149791956, + "grad_norm": 0.42173264369566943, + "learning_rate": 4.798890429958391e-06, + "loss": 0.6116, + "step": 346 + }, + { + "epoch": 0.2406380027739251, + "grad_norm": 0.4233628116294417, + "learning_rate": 4.812760055478503e-06, + "loss": 0.6558, + "step": 347 + }, + { + "epoch": 0.24133148404993066, + "grad_norm": 0.37286416211015233, + "learning_rate": 4.8266296809986135e-06, + "loss": 0.6448, + "step": 348 + }, + { + "epoch": 0.2420249653259362, + "grad_norm": 0.5540498950332101, + "learning_rate": 4.840499306518724e-06, + "loss": 0.6845, + "step": 349 + }, + { + "epoch": 0.24271844660194175, + "grad_norm": 0.39698146940563805, + "learning_rate": 4.854368932038836e-06, + "loss": 0.658, + "step": 350 + }, + { + "epoch": 0.2434119278779473, + "grad_norm": 0.3760647404205541, + "learning_rate": 4.868238557558946e-06, + "loss": 0.6862, + "step": 351 + }, + { + "epoch": 0.24410540915395285, + "grad_norm": 0.3821808346275461, + "learning_rate": 4.882108183079057e-06, + "loss": 0.6398, + "step": 352 + }, + { + "epoch": 0.2447988904299584, + "grad_norm": 0.5129475677657719, + "learning_rate": 4.895977808599168e-06, + "loss": 0.6635, + "step": 353 + }, + { + "epoch": 0.24549237170596394, + "grad_norm": 0.4342552380988674, + "learning_rate": 4.909847434119279e-06, + "loss": 0.6356, + "step": 354 + }, + { + "epoch": 0.2461858529819695, + "grad_norm": 0.5138925585228701, + "learning_rate": 4.9237170596393906e-06, + "loss": 0.6717, + "step": 355 + }, + { + "epoch": 0.24687933425797504, + "grad_norm": 0.3604123159012774, + "learning_rate": 4.937586685159501e-06, + "loss": 0.6659, + "step": 356 + }, + { + "epoch": 0.24757281553398058, + "grad_norm": 0.3908145023887402, + "learning_rate": 4.951456310679612e-06, + "loss": 0.6907, + "step": 357 + }, + { + "epoch": 0.24826629680998613, + "grad_norm": 0.49042164821212136, + "learning_rate": 4.965325936199723e-06, + "loss": 0.7564, + "step": 358 + }, + { + "epoch": 0.24895977808599168, + "grad_norm": 0.3501434572941209, + "learning_rate": 4.979195561719834e-06, + "loss": 0.6338, + "step": 359 + }, + { + "epoch": 0.24965325936199723, + "grad_norm": 0.3830884675531958, + "learning_rate": 4.993065187239945e-06, + "loss": 0.7155, + "step": 360 + }, + { + "epoch": 0.2503467406380028, + "grad_norm": 0.3725694539787639, + "learning_rate": 5.006934812760056e-06, + "loss": 0.6294, + "step": 361 + }, + { + "epoch": 0.2510402219140083, + "grad_norm": 0.34617132963619734, + "learning_rate": 5.020804438280167e-06, + "loss": 0.6415, + "step": 362 + }, + { + "epoch": 0.25173370319001387, + "grad_norm": 0.3683478684346332, + "learning_rate": 5.0346740638002775e-06, + "loss": 0.6448, + "step": 363 + }, + { + "epoch": 0.2524271844660194, + "grad_norm": 0.8710952742326363, + "learning_rate": 5.048543689320389e-06, + "loss": 0.6435, + "step": 364 + }, + { + "epoch": 0.25312066574202496, + "grad_norm": 0.439240482058905, + "learning_rate": 5.0624133148405e-06, + "loss": 0.7665, + "step": 365 + }, + { + "epoch": 0.2538141470180305, + "grad_norm": 0.33216127526038036, + "learning_rate": 5.07628294036061e-06, + "loss": 0.6297, + "step": 366 + }, + { + "epoch": 0.25450762829403606, + "grad_norm": 0.5775981024418212, + "learning_rate": 5.090152565880722e-06, + "loss": 0.6666, + "step": 367 + }, + { + "epoch": 0.2552011095700416, + "grad_norm": 0.38882938462702377, + "learning_rate": 5.104022191400832e-06, + "loss": 0.7085, + "step": 368 + }, + { + "epoch": 0.25589459084604715, + "grad_norm": 0.33482058623909816, + "learning_rate": 5.117891816920944e-06, + "loss": 0.6187, + "step": 369 + }, + { + "epoch": 0.2565880721220527, + "grad_norm": 0.33162870234823194, + "learning_rate": 5.1317614424410545e-06, + "loss": 0.6328, + "step": 370 + }, + { + "epoch": 0.25728155339805825, + "grad_norm": 0.3896860210583244, + "learning_rate": 5.145631067961165e-06, + "loss": 0.6681, + "step": 371 + }, + { + "epoch": 0.2579750346740638, + "grad_norm": 0.7454938688896714, + "learning_rate": 5.159500693481277e-06, + "loss": 0.6656, + "step": 372 + }, + { + "epoch": 0.25866851595006934, + "grad_norm": 0.3747009984960349, + "learning_rate": 5.173370319001387e-06, + "loss": 0.6885, + "step": 373 + }, + { + "epoch": 0.2593619972260749, + "grad_norm": 0.3963573968649995, + "learning_rate": 5.187239944521498e-06, + "loss": 0.7195, + "step": 374 + }, + { + "epoch": 0.26005547850208044, + "grad_norm": 0.3238548069264673, + "learning_rate": 5.2011095700416095e-06, + "loss": 0.6326, + "step": 375 + }, + { + "epoch": 0.260748959778086, + "grad_norm": 0.6022470252979756, + "learning_rate": 5.21497919556172e-06, + "loss": 0.7065, + "step": 376 + }, + { + "epoch": 0.26144244105409153, + "grad_norm": 0.37864127639998046, + "learning_rate": 5.228848821081831e-06, + "loss": 0.6971, + "step": 377 + }, + { + "epoch": 0.2621359223300971, + "grad_norm": 0.40581994582978925, + "learning_rate": 5.242718446601942e-06, + "loss": 0.6546, + "step": 378 + }, + { + "epoch": 0.26282940360610263, + "grad_norm": 0.43523343135296255, + "learning_rate": 5.256588072122053e-06, + "loss": 0.6614, + "step": 379 + }, + { + "epoch": 0.2635228848821082, + "grad_norm": 0.4429279022870644, + "learning_rate": 5.2704576976421636e-06, + "loss": 0.6547, + "step": 380 + }, + { + "epoch": 0.2642163661581137, + "grad_norm": 0.34918946570697734, + "learning_rate": 5.284327323162275e-06, + "loss": 0.5975, + "step": 381 + }, + { + "epoch": 0.26490984743411927, + "grad_norm": 0.3744351134584379, + "learning_rate": 5.298196948682386e-06, + "loss": 0.5928, + "step": 382 + }, + { + "epoch": 0.2656033287101248, + "grad_norm": 0.3854516733573264, + "learning_rate": 5.312066574202497e-06, + "loss": 0.6574, + "step": 383 + }, + { + "epoch": 0.26629680998613037, + "grad_norm": 0.3650426392099008, + "learning_rate": 5.325936199722608e-06, + "loss": 0.6325, + "step": 384 + }, + { + "epoch": 0.2669902912621359, + "grad_norm": 0.3663888776279793, + "learning_rate": 5.3398058252427185e-06, + "loss": 0.6125, + "step": 385 + }, + { + "epoch": 0.26768377253814146, + "grad_norm": 0.4243630833933148, + "learning_rate": 5.35367545076283e-06, + "loss": 0.6597, + "step": 386 + }, + { + "epoch": 0.268377253814147, + "grad_norm": 0.3744910129430353, + "learning_rate": 5.367545076282941e-06, + "loss": 0.6109, + "step": 387 + }, + { + "epoch": 0.26907073509015256, + "grad_norm": 0.34465466744243556, + "learning_rate": 5.381414701803051e-06, + "loss": 0.6416, + "step": 388 + }, + { + "epoch": 0.2697642163661581, + "grad_norm": 0.41674605693511635, + "learning_rate": 5.395284327323163e-06, + "loss": 0.6233, + "step": 389 + }, + { + "epoch": 0.27045769764216365, + "grad_norm": 0.38060760426150736, + "learning_rate": 5.4091539528432735e-06, + "loss": 0.7213, + "step": 390 + }, + { + "epoch": 0.2711511789181692, + "grad_norm": 0.40123837020401165, + "learning_rate": 5.423023578363384e-06, + "loss": 0.6753, + "step": 391 + }, + { + "epoch": 0.27184466019417475, + "grad_norm": 0.36451925116080663, + "learning_rate": 5.436893203883496e-06, + "loss": 0.6328, + "step": 392 + }, + { + "epoch": 0.2725381414701803, + "grad_norm": 0.36749822505626917, + "learning_rate": 5.450762829403606e-06, + "loss": 0.6341, + "step": 393 + }, + { + "epoch": 0.27323162274618584, + "grad_norm": 0.4096310577786831, + "learning_rate": 5.464632454923717e-06, + "loss": 0.6723, + "step": 394 + }, + { + "epoch": 0.2739251040221914, + "grad_norm": 0.4237347819620758, + "learning_rate": 5.478502080443828e-06, + "loss": 0.7211, + "step": 395 + }, + { + "epoch": 0.27461858529819694, + "grad_norm": 0.33936564676239156, + "learning_rate": 5.492371705963939e-06, + "loss": 0.6453, + "step": 396 + }, + { + "epoch": 0.2753120665742025, + "grad_norm": 0.3658780802205608, + "learning_rate": 5.5062413314840505e-06, + "loss": 0.7085, + "step": 397 + }, + { + "epoch": 0.27600554785020803, + "grad_norm": 0.3730632292217037, + "learning_rate": 5.520110957004161e-06, + "loss": 0.6712, + "step": 398 + }, + { + "epoch": 0.2766990291262136, + "grad_norm": 0.3795485729408459, + "learning_rate": 5.533980582524272e-06, + "loss": 0.717, + "step": 399 + }, + { + "epoch": 0.27739251040221913, + "grad_norm": 0.3774321745578124, + "learning_rate": 5.547850208044383e-06, + "loss": 0.6526, + "step": 400 + }, + { + "epoch": 0.2780859916782247, + "grad_norm": 0.3746065542373678, + "learning_rate": 5.561719833564494e-06, + "loss": 0.654, + "step": 401 + }, + { + "epoch": 0.2787794729542302, + "grad_norm": 0.39011429530147207, + "learning_rate": 5.575589459084605e-06, + "loss": 0.7052, + "step": 402 + }, + { + "epoch": 0.27947295423023577, + "grad_norm": 0.4119729248892106, + "learning_rate": 5.589459084604716e-06, + "loss": 0.6616, + "step": 403 + }, + { + "epoch": 0.2801664355062413, + "grad_norm": 0.4143740013594362, + "learning_rate": 5.603328710124827e-06, + "loss": 0.7257, + "step": 404 + }, + { + "epoch": 0.28085991678224687, + "grad_norm": 0.41908972692371427, + "learning_rate": 5.6171983356449374e-06, + "loss": 0.7525, + "step": 405 + }, + { + "epoch": 0.2815533980582524, + "grad_norm": 0.4165549426702378, + "learning_rate": 5.631067961165049e-06, + "loss": 0.6743, + "step": 406 + }, + { + "epoch": 0.28224687933425796, + "grad_norm": 0.4154829042919571, + "learning_rate": 5.6449375866851596e-06, + "loss": 0.6862, + "step": 407 + }, + { + "epoch": 0.2829403606102635, + "grad_norm": 0.5072686293952215, + "learning_rate": 5.65880721220527e-06, + "loss": 0.6137, + "step": 408 + }, + { + "epoch": 0.28363384188626906, + "grad_norm": 0.37917909862888977, + "learning_rate": 5.672676837725382e-06, + "loss": 0.6845, + "step": 409 + }, + { + "epoch": 0.2843273231622746, + "grad_norm": 0.3613478033568616, + "learning_rate": 5.686546463245492e-06, + "loss": 0.6435, + "step": 410 + }, + { + "epoch": 0.28502080443828015, + "grad_norm": 0.4034578650552871, + "learning_rate": 5.700416088765604e-06, + "loss": 0.6732, + "step": 411 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 0.3779341136244697, + "learning_rate": 5.7142857142857145e-06, + "loss": 0.6785, + "step": 412 + }, + { + "epoch": 0.28640776699029125, + "grad_norm": 0.3793816313097279, + "learning_rate": 5.728155339805825e-06, + "loss": 0.5901, + "step": 413 + }, + { + "epoch": 0.2871012482662968, + "grad_norm": 0.3964948080285677, + "learning_rate": 5.742024965325937e-06, + "loss": 0.7356, + "step": 414 + }, + { + "epoch": 0.28779472954230234, + "grad_norm": 0.35077158308234324, + "learning_rate": 5.755894590846047e-06, + "loss": 0.6466, + "step": 415 + }, + { + "epoch": 0.2884882108183079, + "grad_norm": 0.3737881400105352, + "learning_rate": 5.769764216366158e-06, + "loss": 0.7318, + "step": 416 + }, + { + "epoch": 0.28918169209431344, + "grad_norm": 0.3420061085007688, + "learning_rate": 5.7836338418862695e-06, + "loss": 0.6436, + "step": 417 + }, + { + "epoch": 0.289875173370319, + "grad_norm": 0.3835060764229847, + "learning_rate": 5.79750346740638e-06, + "loss": 0.7166, + "step": 418 + }, + { + "epoch": 0.29056865464632453, + "grad_norm": 0.4041934642032845, + "learning_rate": 5.811373092926491e-06, + "loss": 0.6385, + "step": 419 + }, + { + "epoch": 0.2912621359223301, + "grad_norm": 0.38980872524931653, + "learning_rate": 5.825242718446602e-06, + "loss": 0.6625, + "step": 420 + }, + { + "epoch": 0.2919556171983356, + "grad_norm": 0.3926345259523608, + "learning_rate": 5.839112343966713e-06, + "loss": 0.6889, + "step": 421 + }, + { + "epoch": 0.2926490984743412, + "grad_norm": 0.387207170404907, + "learning_rate": 5.8529819694868235e-06, + "loss": 0.6491, + "step": 422 + }, + { + "epoch": 0.2933425797503467, + "grad_norm": 0.47073473437328256, + "learning_rate": 5.866851595006935e-06, + "loss": 0.7384, + "step": 423 + }, + { + "epoch": 0.29403606102635227, + "grad_norm": 0.3829161705241466, + "learning_rate": 5.880721220527046e-06, + "loss": 0.6238, + "step": 424 + }, + { + "epoch": 0.2947295423023578, + "grad_norm": 0.3749217356284979, + "learning_rate": 5.894590846047157e-06, + "loss": 0.6202, + "step": 425 + }, + { + "epoch": 0.29542302357836336, + "grad_norm": 0.3795656266908337, + "learning_rate": 5.908460471567268e-06, + "loss": 0.5814, + "step": 426 + }, + { + "epoch": 0.2961165048543689, + "grad_norm": 0.3352327571349482, + "learning_rate": 5.9223300970873785e-06, + "loss": 0.6219, + "step": 427 + }, + { + "epoch": 0.29680998613037446, + "grad_norm": 0.45680581115880275, + "learning_rate": 5.93619972260749e-06, + "loss": 0.6776, + "step": 428 + }, + { + "epoch": 0.29750346740638, + "grad_norm": 0.35900628684279645, + "learning_rate": 5.950069348127601e-06, + "loss": 0.5489, + "step": 429 + }, + { + "epoch": 0.29819694868238555, + "grad_norm": 0.36206425800547715, + "learning_rate": 5.963938973647711e-06, + "loss": 0.659, + "step": 430 + }, + { + "epoch": 0.2988904299583911, + "grad_norm": 0.37646322689185413, + "learning_rate": 5.977808599167823e-06, + "loss": 0.6916, + "step": 431 + }, + { + "epoch": 0.29958391123439665, + "grad_norm": 0.37249710905483346, + "learning_rate": 5.9916782246879334e-06, + "loss": 0.6882, + "step": 432 + }, + { + "epoch": 0.3002773925104022, + "grad_norm": 0.34689884606211574, + "learning_rate": 6.005547850208044e-06, + "loss": 0.6101, + "step": 433 + }, + { + "epoch": 0.30097087378640774, + "grad_norm": 0.34496297242635743, + "learning_rate": 6.0194174757281556e-06, + "loss": 0.6395, + "step": 434 + }, + { + "epoch": 0.3016643550624133, + "grad_norm": 0.3837297230084832, + "learning_rate": 6.033287101248266e-06, + "loss": 0.6713, + "step": 435 + }, + { + "epoch": 0.30235783633841884, + "grad_norm": 0.3765465485638321, + "learning_rate": 6.047156726768377e-06, + "loss": 0.6289, + "step": 436 + }, + { + "epoch": 0.3030513176144244, + "grad_norm": 0.38880642643366503, + "learning_rate": 6.061026352288488e-06, + "loss": 0.6224, + "step": 437 + }, + { + "epoch": 0.30374479889042993, + "grad_norm": 0.3426690228325314, + "learning_rate": 6.074895977808599e-06, + "loss": 0.6254, + "step": 438 + }, + { + "epoch": 0.3044382801664355, + "grad_norm": 0.36772812499968494, + "learning_rate": 6.0887656033287105e-06, + "loss": 0.5951, + "step": 439 + }, + { + "epoch": 0.30513176144244103, + "grad_norm": 0.49899171788191177, + "learning_rate": 6.102635228848821e-06, + "loss": 0.6773, + "step": 440 + }, + { + "epoch": 0.3058252427184466, + "grad_norm": 0.3834180695542531, + "learning_rate": 6.116504854368932e-06, + "loss": 0.7215, + "step": 441 + }, + { + "epoch": 0.3065187239944521, + "grad_norm": 0.36106603703063916, + "learning_rate": 6.130374479889043e-06, + "loss": 0.6376, + "step": 442 + }, + { + "epoch": 0.30721220527045767, + "grad_norm": 0.3710802833985127, + "learning_rate": 6.144244105409154e-06, + "loss": 0.6756, + "step": 443 + }, + { + "epoch": 0.3079056865464632, + "grad_norm": 0.3823448067450143, + "learning_rate": 6.158113730929265e-06, + "loss": 0.6882, + "step": 444 + }, + { + "epoch": 0.30859916782246877, + "grad_norm": 0.3625031250425025, + "learning_rate": 6.171983356449376e-06, + "loss": 0.5832, + "step": 445 + }, + { + "epoch": 0.3092926490984743, + "grad_norm": 0.328308299367241, + "learning_rate": 6.185852981969487e-06, + "loss": 0.5977, + "step": 446 + }, + { + "epoch": 0.30998613037447986, + "grad_norm": 0.34092776179197676, + "learning_rate": 6.199722607489597e-06, + "loss": 0.5933, + "step": 447 + }, + { + "epoch": 0.3106796116504854, + "grad_norm": 0.42115580709489114, + "learning_rate": 6.213592233009709e-06, + "loss": 0.6959, + "step": 448 + }, + { + "epoch": 0.31137309292649096, + "grad_norm": 0.38627872596079266, + "learning_rate": 6.2274618585298195e-06, + "loss": 0.6929, + "step": 449 + }, + { + "epoch": 0.3120665742024965, + "grad_norm": 0.3259695951514646, + "learning_rate": 6.24133148404993e-06, + "loss": 0.6951, + "step": 450 + }, + { + "epoch": 0.3127600554785021, + "grad_norm": 0.37187630418518997, + "learning_rate": 6.2552011095700425e-06, + "loss": 0.6646, + "step": 451 + }, + { + "epoch": 0.31345353675450766, + "grad_norm": 0.3397739273074188, + "learning_rate": 6.269070735090154e-06, + "loss": 0.6664, + "step": 452 + }, + { + "epoch": 0.3141470180305132, + "grad_norm": 0.3907456233974298, + "learning_rate": 6.282940360610265e-06, + "loss": 0.6513, + "step": 453 + }, + { + "epoch": 0.31484049930651875, + "grad_norm": 0.33685206683928487, + "learning_rate": 6.296809986130375e-06, + "loss": 0.6131, + "step": 454 + }, + { + "epoch": 0.3155339805825243, + "grad_norm": 0.5047864328561893, + "learning_rate": 6.310679611650487e-06, + "loss": 0.5541, + "step": 455 + }, + { + "epoch": 0.31622746185852985, + "grad_norm": 0.3622958618519615, + "learning_rate": 6.3245492371705975e-06, + "loss": 0.6889, + "step": 456 + }, + { + "epoch": 0.3169209431345354, + "grad_norm": 0.3706772455463886, + "learning_rate": 6.338418862690708e-06, + "loss": 0.7271, + "step": 457 + }, + { + "epoch": 0.31761442441054094, + "grad_norm": 0.35327107215079073, + "learning_rate": 6.35228848821082e-06, + "loss": 0.6879, + "step": 458 + }, + { + "epoch": 0.3183079056865465, + "grad_norm": 0.43921428412042124, + "learning_rate": 6.36615811373093e-06, + "loss": 0.6329, + "step": 459 + }, + { + "epoch": 0.31900138696255204, + "grad_norm": 0.38468429534426213, + "learning_rate": 6.380027739251041e-06, + "loss": 0.6341, + "step": 460 + }, + { + "epoch": 0.3196948682385576, + "grad_norm": 0.34535486919600455, + "learning_rate": 6.393897364771152e-06, + "loss": 0.6254, + "step": 461 + }, + { + "epoch": 0.32038834951456313, + "grad_norm": 0.3635102675515314, + "learning_rate": 6.407766990291263e-06, + "loss": 0.6085, + "step": 462 + }, + { + "epoch": 0.3210818307905687, + "grad_norm": 0.3638395408480024, + "learning_rate": 6.421636615811374e-06, + "loss": 0.6342, + "step": 463 + }, + { + "epoch": 0.3217753120665742, + "grad_norm": 0.4335637077465101, + "learning_rate": 6.435506241331485e-06, + "loss": 0.6225, + "step": 464 + }, + { + "epoch": 0.3224687933425798, + "grad_norm": 0.45472390117370814, + "learning_rate": 6.449375866851596e-06, + "loss": 0.6208, + "step": 465 + }, + { + "epoch": 0.3231622746185853, + "grad_norm": 0.38973500636958547, + "learning_rate": 6.463245492371707e-06, + "loss": 0.6756, + "step": 466 + }, + { + "epoch": 0.32385575589459087, + "grad_norm": 0.4103960632318212, + "learning_rate": 6.477115117891818e-06, + "loss": 0.6527, + "step": 467 + }, + { + "epoch": 0.3245492371705964, + "grad_norm": 0.3870530980825968, + "learning_rate": 6.490984743411929e-06, + "loss": 0.6946, + "step": 468 + }, + { + "epoch": 0.32524271844660196, + "grad_norm": 0.3788150859492285, + "learning_rate": 6.50485436893204e-06, + "loss": 0.659, + "step": 469 + }, + { + "epoch": 0.3259361997226075, + "grad_norm": 0.36576904727712745, + "learning_rate": 6.518723994452151e-06, + "loss": 0.6075, + "step": 470 + }, + { + "epoch": 0.32662968099861306, + "grad_norm": 0.38197827075641066, + "learning_rate": 6.5325936199722614e-06, + "loss": 0.6796, + "step": 471 + }, + { + "epoch": 0.3273231622746186, + "grad_norm": 0.35198341424500457, + "learning_rate": 6.546463245492373e-06, + "loss": 0.6248, + "step": 472 + }, + { + "epoch": 0.32801664355062415, + "grad_norm": 0.3545850515599311, + "learning_rate": 6.560332871012484e-06, + "loss": 0.6155, + "step": 473 + }, + { + "epoch": 0.3287101248266297, + "grad_norm": 0.34892181852229914, + "learning_rate": 6.574202496532594e-06, + "loss": 0.6337, + "step": 474 + }, + { + "epoch": 0.32940360610263525, + "grad_norm": 0.3499097210355433, + "learning_rate": 6.588072122052706e-06, + "loss": 0.609, + "step": 475 + }, + { + "epoch": 0.3300970873786408, + "grad_norm": 0.3670643965397277, + "learning_rate": 6.601941747572816e-06, + "loss": 0.6147, + "step": 476 + }, + { + "epoch": 0.33079056865464634, + "grad_norm": 0.36863552844334313, + "learning_rate": 6.615811373092927e-06, + "loss": 0.6348, + "step": 477 + }, + { + "epoch": 0.3314840499306519, + "grad_norm": 0.47263933356334575, + "learning_rate": 6.6296809986130385e-06, + "loss": 0.6608, + "step": 478 + }, + { + "epoch": 0.33217753120665744, + "grad_norm": 0.3895357643648736, + "learning_rate": 6.643550624133149e-06, + "loss": 0.6418, + "step": 479 + }, + { + "epoch": 0.332871012482663, + "grad_norm": 0.48141877638096237, + "learning_rate": 6.657420249653261e-06, + "loss": 0.6115, + "step": 480 + }, + { + "epoch": 0.33356449375866853, + "grad_norm": 0.36426459634017727, + "learning_rate": 6.671289875173371e-06, + "loss": 0.6569, + "step": 481 + }, + { + "epoch": 0.3342579750346741, + "grad_norm": 0.35593652995273567, + "learning_rate": 6.685159500693482e-06, + "loss": 0.5888, + "step": 482 + }, + { + "epoch": 0.33495145631067963, + "grad_norm": 0.3406668588484618, + "learning_rate": 6.6990291262135935e-06, + "loss": 0.6684, + "step": 483 + }, + { + "epoch": 0.3356449375866852, + "grad_norm": 0.39969595043710154, + "learning_rate": 6.712898751733704e-06, + "loss": 0.5995, + "step": 484 + }, + { + "epoch": 0.3363384188626907, + "grad_norm": 0.39264539382428926, + "learning_rate": 6.726768377253815e-06, + "loss": 0.6629, + "step": 485 + }, + { + "epoch": 0.33703190013869627, + "grad_norm": 0.3546116714856907, + "learning_rate": 6.740638002773926e-06, + "loss": 0.7072, + "step": 486 + }, + { + "epoch": 0.3377253814147018, + "grad_norm": 0.33241502234836434, + "learning_rate": 6.754507628294037e-06, + "loss": 0.6037, + "step": 487 + }, + { + "epoch": 0.33841886269070737, + "grad_norm": 0.4155281461865311, + "learning_rate": 6.7683772538141476e-06, + "loss": 0.667, + "step": 488 + }, + { + "epoch": 0.3391123439667129, + "grad_norm": 0.49872886700308955, + "learning_rate": 6.782246879334259e-06, + "loss": 0.6361, + "step": 489 + }, + { + "epoch": 0.33980582524271846, + "grad_norm": 0.3648031387476405, + "learning_rate": 6.79611650485437e-06, + "loss": 0.6393, + "step": 490 + }, + { + "epoch": 0.340499306518724, + "grad_norm": 0.3823274123017525, + "learning_rate": 6.80998613037448e-06, + "loss": 0.6614, + "step": 491 + }, + { + "epoch": 0.34119278779472956, + "grad_norm": 0.36274873250507716, + "learning_rate": 6.823855755894592e-06, + "loss": 0.6473, + "step": 492 + }, + { + "epoch": 0.3418862690707351, + "grad_norm": 0.3318657324302781, + "learning_rate": 6.8377253814147025e-06, + "loss": 0.6305, + "step": 493 + }, + { + "epoch": 0.34257975034674065, + "grad_norm": 0.3737790320555523, + "learning_rate": 6.851595006934814e-06, + "loss": 0.6664, + "step": 494 + }, + { + "epoch": 0.3432732316227462, + "grad_norm": 0.38558426487726943, + "learning_rate": 6.865464632454925e-06, + "loss": 0.638, + "step": 495 + }, + { + "epoch": 0.34396671289875175, + "grad_norm": 0.3506520197301319, + "learning_rate": 6.879334257975035e-06, + "loss": 0.6817, + "step": 496 + }, + { + "epoch": 0.3446601941747573, + "grad_norm": 0.3392313412226792, + "learning_rate": 6.893203883495147e-06, + "loss": 0.5512, + "step": 497 + }, + { + "epoch": 0.34535367545076284, + "grad_norm": 0.43644671710710203, + "learning_rate": 6.9070735090152574e-06, + "loss": 0.6812, + "step": 498 + }, + { + "epoch": 0.3460471567267684, + "grad_norm": 0.37686842175908253, + "learning_rate": 6.920943134535368e-06, + "loss": 0.6731, + "step": 499 + }, + { + "epoch": 0.34674063800277394, + "grad_norm": 0.3750595228107398, + "learning_rate": 6.93481276005548e-06, + "loss": 0.6621, + "step": 500 + }, + { + "epoch": 0.3474341192787795, + "grad_norm": 0.4494243935110913, + "learning_rate": 6.94868238557559e-06, + "loss": 0.6918, + "step": 501 + }, + { + "epoch": 0.34812760055478503, + "grad_norm": 0.37332066624967647, + "learning_rate": 6.962552011095701e-06, + "loss": 0.6737, + "step": 502 + }, + { + "epoch": 0.3488210818307906, + "grad_norm": 0.40971393810899, + "learning_rate": 6.976421636615812e-06, + "loss": 0.6566, + "step": 503 + }, + { + "epoch": 0.34951456310679613, + "grad_norm": 0.3997993832631874, + "learning_rate": 6.990291262135923e-06, + "loss": 0.6523, + "step": 504 + }, + { + "epoch": 0.3502080443828017, + "grad_norm": 0.3368593714753437, + "learning_rate": 7.004160887656034e-06, + "loss": 0.6393, + "step": 505 + }, + { + "epoch": 0.3509015256588072, + "grad_norm": 0.39120053297204876, + "learning_rate": 7.018030513176145e-06, + "loss": 0.6642, + "step": 506 + }, + { + "epoch": 0.35159500693481277, + "grad_norm": 0.40169017314000155, + "learning_rate": 7.031900138696256e-06, + "loss": 0.6212, + "step": 507 + }, + { + "epoch": 0.3522884882108183, + "grad_norm": 0.35031468554255296, + "learning_rate": 7.045769764216367e-06, + "loss": 0.6441, + "step": 508 + }, + { + "epoch": 0.35298196948682387, + "grad_norm": 0.35425545147392234, + "learning_rate": 7.059639389736478e-06, + "loss": 0.5838, + "step": 509 + }, + { + "epoch": 0.3536754507628294, + "grad_norm": 0.349257127183559, + "learning_rate": 7.073509015256589e-06, + "loss": 0.5501, + "step": 510 + }, + { + "epoch": 0.35436893203883496, + "grad_norm": 0.4173550075217959, + "learning_rate": 7.0873786407767e-06, + "loss": 0.653, + "step": 511 + }, + { + "epoch": 0.3550624133148405, + "grad_norm": 0.35310911339201523, + "learning_rate": 7.101248266296811e-06, + "loss": 0.6148, + "step": 512 + }, + { + "epoch": 0.35575589459084606, + "grad_norm": 0.3724106105149326, + "learning_rate": 7.115117891816921e-06, + "loss": 0.6372, + "step": 513 + }, + { + "epoch": 0.3564493758668516, + "grad_norm": 0.3746910839190923, + "learning_rate": 7.128987517337033e-06, + "loss": 0.6611, + "step": 514 + }, + { + "epoch": 0.35714285714285715, + "grad_norm": 0.5758133722145943, + "learning_rate": 7.1428571428571436e-06, + "loss": 0.6705, + "step": 515 + }, + { + "epoch": 0.3578363384188627, + "grad_norm": 0.36184393180889357, + "learning_rate": 7.156726768377254e-06, + "loss": 0.6169, + "step": 516 + }, + { + "epoch": 0.35852981969486825, + "grad_norm": 0.3890621880842859, + "learning_rate": 7.170596393897366e-06, + "loss": 0.6163, + "step": 517 + }, + { + "epoch": 0.3592233009708738, + "grad_norm": 0.4163638890746787, + "learning_rate": 7.184466019417476e-06, + "loss": 0.6984, + "step": 518 + }, + { + "epoch": 0.35991678224687934, + "grad_norm": 0.3678646271128726, + "learning_rate": 7.198335644937587e-06, + "loss": 0.5918, + "step": 519 + }, + { + "epoch": 0.3606102635228849, + "grad_norm": 0.33526320878213023, + "learning_rate": 7.2122052704576985e-06, + "loss": 0.5814, + "step": 520 + }, + { + "epoch": 0.36130374479889044, + "grad_norm": 0.3955745590406837, + "learning_rate": 7.226074895977809e-06, + "loss": 0.6317, + "step": 521 + }, + { + "epoch": 0.361997226074896, + "grad_norm": 0.36053174111861014, + "learning_rate": 7.239944521497921e-06, + "loss": 0.6554, + "step": 522 + }, + { + "epoch": 0.36269070735090153, + "grad_norm": 0.3564894704917171, + "learning_rate": 7.253814147018031e-06, + "loss": 0.6125, + "step": 523 + }, + { + "epoch": 0.3633841886269071, + "grad_norm": 0.33316077155749546, + "learning_rate": 7.267683772538142e-06, + "loss": 0.6448, + "step": 524 + }, + { + "epoch": 0.3640776699029126, + "grad_norm": 0.34744648435203684, + "learning_rate": 7.2815533980582534e-06, + "loss": 0.6217, + "step": 525 + }, + { + "epoch": 0.3647711511789182, + "grad_norm": 0.33563035223656806, + "learning_rate": 7.295423023578364e-06, + "loss": 0.5824, + "step": 526 + }, + { + "epoch": 0.3654646324549237, + "grad_norm": 0.3864415802342526, + "learning_rate": 7.309292649098475e-06, + "loss": 0.6363, + "step": 527 + }, + { + "epoch": 0.36615811373092927, + "grad_norm": 0.3716995633111904, + "learning_rate": 7.323162274618586e-06, + "loss": 0.568, + "step": 528 + }, + { + "epoch": 0.3668515950069348, + "grad_norm": 0.330666339054274, + "learning_rate": 7.337031900138697e-06, + "loss": 0.6534, + "step": 529 + }, + { + "epoch": 0.36754507628294036, + "grad_norm": 0.3580854730202074, + "learning_rate": 7.3509015256588075e-06, + "loss": 0.7107, + "step": 530 + }, + { + "epoch": 0.3682385575589459, + "grad_norm": 0.3263473830256869, + "learning_rate": 7.364771151178919e-06, + "loss": 0.6434, + "step": 531 + }, + { + "epoch": 0.36893203883495146, + "grad_norm": 0.4165092917597925, + "learning_rate": 7.37864077669903e-06, + "loss": 0.6386, + "step": 532 + }, + { + "epoch": 0.369625520110957, + "grad_norm": 0.3623391989288637, + "learning_rate": 7.39251040221914e-06, + "loss": 0.7066, + "step": 533 + }, + { + "epoch": 0.37031900138696255, + "grad_norm": 0.4316033729520409, + "learning_rate": 7.406380027739252e-06, + "loss": 0.6199, + "step": 534 + }, + { + "epoch": 0.3710124826629681, + "grad_norm": 0.39532364646227364, + "learning_rate": 7.4202496532593625e-06, + "loss": 0.6207, + "step": 535 + }, + { + "epoch": 0.37170596393897365, + "grad_norm": 0.3705591524094307, + "learning_rate": 7.434119278779474e-06, + "loss": 0.6097, + "step": 536 + }, + { + "epoch": 0.3723994452149792, + "grad_norm": 0.4032004372577973, + "learning_rate": 7.447988904299585e-06, + "loss": 0.6089, + "step": 537 + }, + { + "epoch": 0.37309292649098474, + "grad_norm": 0.35382613007991376, + "learning_rate": 7.461858529819695e-06, + "loss": 0.5554, + "step": 538 + }, + { + "epoch": 0.3737864077669903, + "grad_norm": 0.40564277826124845, + "learning_rate": 7.475728155339807e-06, + "loss": 0.6621, + "step": 539 + }, + { + "epoch": 0.37447988904299584, + "grad_norm": 0.39302305957432315, + "learning_rate": 7.489597780859917e-06, + "loss": 0.6141, + "step": 540 + }, + { + "epoch": 0.3751733703190014, + "grad_norm": 0.4214478728911209, + "learning_rate": 7.503467406380028e-06, + "loss": 0.6199, + "step": 541 + }, + { + "epoch": 0.37586685159500693, + "grad_norm": 0.3628049824721709, + "learning_rate": 7.5173370319001396e-06, + "loss": 0.6865, + "step": 542 + }, + { + "epoch": 0.3765603328710125, + "grad_norm": 0.4012988276021495, + "learning_rate": 7.53120665742025e-06, + "loss": 0.6015, + "step": 543 + }, + { + "epoch": 0.37725381414701803, + "grad_norm": 0.35129563570548594, + "learning_rate": 7.545076282940361e-06, + "loss": 0.6366, + "step": 544 + }, + { + "epoch": 0.3779472954230236, + "grad_norm": 0.3233070882668223, + "learning_rate": 7.558945908460472e-06, + "loss": 0.5471, + "step": 545 + }, + { + "epoch": 0.3786407766990291, + "grad_norm": 0.369766794917211, + "learning_rate": 7.572815533980583e-06, + "loss": 0.6021, + "step": 546 + }, + { + "epoch": 0.37933425797503467, + "grad_norm": 0.7027124293210516, + "learning_rate": 7.586685159500694e-06, + "loss": 0.5974, + "step": 547 + }, + { + "epoch": 0.3800277392510402, + "grad_norm": 0.3508078980491497, + "learning_rate": 7.600554785020805e-06, + "loss": 0.6159, + "step": 548 + }, + { + "epoch": 0.38072122052704577, + "grad_norm": 0.3447122332376924, + "learning_rate": 7.614424410540916e-06, + "loss": 0.625, + "step": 549 + }, + { + "epoch": 0.3814147018030513, + "grad_norm": 0.36742652699902334, + "learning_rate": 7.628294036061027e-06, + "loss": 0.6799, + "step": 550 + }, + { + "epoch": 0.38210818307905686, + "grad_norm": 0.39029782442341576, + "learning_rate": 7.642163661581138e-06, + "loss": 0.6576, + "step": 551 + }, + { + "epoch": 0.3828016643550624, + "grad_norm": 0.3694327119768504, + "learning_rate": 7.65603328710125e-06, + "loss": 0.595, + "step": 552 + }, + { + "epoch": 0.38349514563106796, + "grad_norm": 0.38181457868583935, + "learning_rate": 7.66990291262136e-06, + "loss": 0.6731, + "step": 553 + }, + { + "epoch": 0.3841886269070735, + "grad_norm": 0.3585947759237626, + "learning_rate": 7.68377253814147e-06, + "loss": 0.5964, + "step": 554 + }, + { + "epoch": 0.38488210818307905, + "grad_norm": 0.3711436288734461, + "learning_rate": 7.697642163661582e-06, + "loss": 0.6104, + "step": 555 + }, + { + "epoch": 0.3855755894590846, + "grad_norm": 0.32718425674116086, + "learning_rate": 7.711511789181692e-06, + "loss": 0.6054, + "step": 556 + }, + { + "epoch": 0.38626907073509015, + "grad_norm": 0.40139732659377303, + "learning_rate": 7.725381414701804e-06, + "loss": 0.6481, + "step": 557 + }, + { + "epoch": 0.3869625520110957, + "grad_norm": 0.3373681258759682, + "learning_rate": 7.739251040221915e-06, + "loss": 0.6284, + "step": 558 + }, + { + "epoch": 0.38765603328710124, + "grad_norm": 0.34892409173412364, + "learning_rate": 7.753120665742025e-06, + "loss": 0.6358, + "step": 559 + }, + { + "epoch": 0.3883495145631068, + "grad_norm": 0.32381651149847956, + "learning_rate": 7.766990291262136e-06, + "loss": 0.614, + "step": 560 + }, + { + "epoch": 0.38904299583911234, + "grad_norm": 0.5387017088611541, + "learning_rate": 7.780859916782248e-06, + "loss": 0.658, + "step": 561 + }, + { + "epoch": 0.3897364771151179, + "grad_norm": 0.3644482405436353, + "learning_rate": 7.794729542302358e-06, + "loss": 0.567, + "step": 562 + }, + { + "epoch": 0.39042995839112343, + "grad_norm": 0.3563311439881521, + "learning_rate": 7.808599167822469e-06, + "loss": 0.6288, + "step": 563 + }, + { + "epoch": 0.391123439667129, + "grad_norm": 0.3647926377245987, + "learning_rate": 7.82246879334258e-06, + "loss": 0.6083, + "step": 564 + }, + { + "epoch": 0.39181692094313453, + "grad_norm": 0.38784353311841147, + "learning_rate": 7.83633841886269e-06, + "loss": 0.6638, + "step": 565 + }, + { + "epoch": 0.3925104022191401, + "grad_norm": 0.3627516423040043, + "learning_rate": 7.850208044382802e-06, + "loss": 0.629, + "step": 566 + }, + { + "epoch": 0.3932038834951456, + "grad_norm": 0.3709618601859805, + "learning_rate": 7.864077669902913e-06, + "loss": 0.652, + "step": 567 + }, + { + "epoch": 0.39389736477115117, + "grad_norm": 0.367595653098122, + "learning_rate": 7.877947295423023e-06, + "loss": 0.6287, + "step": 568 + }, + { + "epoch": 0.3945908460471567, + "grad_norm": 0.37530796936263455, + "learning_rate": 7.891816920943135e-06, + "loss": 0.6012, + "step": 569 + }, + { + "epoch": 0.39528432732316227, + "grad_norm": 0.3630061585167657, + "learning_rate": 7.905686546463246e-06, + "loss": 0.6399, + "step": 570 + }, + { + "epoch": 0.3959778085991678, + "grad_norm": 0.4505300922846165, + "learning_rate": 7.919556171983358e-06, + "loss": 0.5954, + "step": 571 + }, + { + "epoch": 0.39667128987517336, + "grad_norm": 0.5241355468709938, + "learning_rate": 7.933425797503468e-06, + "loss": 0.5999, + "step": 572 + }, + { + "epoch": 0.3973647711511789, + "grad_norm": 0.6596590261106783, + "learning_rate": 7.947295423023579e-06, + "loss": 0.6565, + "step": 573 + }, + { + "epoch": 0.39805825242718446, + "grad_norm": 0.3421029721387191, + "learning_rate": 7.96116504854369e-06, + "loss": 0.6338, + "step": 574 + }, + { + "epoch": 0.39875173370319, + "grad_norm": 0.3818309929008366, + "learning_rate": 7.9750346740638e-06, + "loss": 0.5922, + "step": 575 + }, + { + "epoch": 0.39944521497919555, + "grad_norm": 0.3370977908591438, + "learning_rate": 7.988904299583912e-06, + "loss": 0.5702, + "step": 576 + }, + { + "epoch": 0.4001386962552011, + "grad_norm": 0.4148378425411659, + "learning_rate": 8.002773925104023e-06, + "loss": 0.6824, + "step": 577 + }, + { + "epoch": 0.40083217753120665, + "grad_norm": 0.3559257403836171, + "learning_rate": 8.016643550624133e-06, + "loss": 0.5815, + "step": 578 + }, + { + "epoch": 0.4015256588072122, + "grad_norm": 0.390233012997757, + "learning_rate": 8.030513176144245e-06, + "loss": 0.6534, + "step": 579 + }, + { + "epoch": 0.40221914008321774, + "grad_norm": 0.35742287491923425, + "learning_rate": 8.044382801664356e-06, + "loss": 0.6017, + "step": 580 + }, + { + "epoch": 0.4029126213592233, + "grad_norm": 0.37159797793736915, + "learning_rate": 8.058252427184466e-06, + "loss": 0.6613, + "step": 581 + }, + { + "epoch": 0.40360610263522884, + "grad_norm": 0.3610419889811722, + "learning_rate": 8.072122052704577e-06, + "loss": 0.606, + "step": 582 + }, + { + "epoch": 0.4042995839112344, + "grad_norm": 0.3213564416582794, + "learning_rate": 8.085991678224689e-06, + "loss": 0.6015, + "step": 583 + }, + { + "epoch": 0.40499306518723993, + "grad_norm": 0.45625536503963865, + "learning_rate": 8.099861303744799e-06, + "loss": 0.6629, + "step": 584 + }, + { + "epoch": 0.4056865464632455, + "grad_norm": 0.3453465437928662, + "learning_rate": 8.11373092926491e-06, + "loss": 0.5674, + "step": 585 + }, + { + "epoch": 0.406380027739251, + "grad_norm": 0.38035315799004166, + "learning_rate": 8.127600554785022e-06, + "loss": 0.584, + "step": 586 + }, + { + "epoch": 0.4070735090152566, + "grad_norm": 0.38026533204245244, + "learning_rate": 8.141470180305131e-06, + "loss": 0.6458, + "step": 587 + }, + { + "epoch": 0.4077669902912621, + "grad_norm": 0.505652936690654, + "learning_rate": 8.155339805825243e-06, + "loss": 0.5777, + "step": 588 + }, + { + "epoch": 0.40846047156726767, + "grad_norm": 0.3630153463847811, + "learning_rate": 8.169209431345354e-06, + "loss": 0.6462, + "step": 589 + }, + { + "epoch": 0.4091539528432732, + "grad_norm": 0.44266297377838537, + "learning_rate": 8.183079056865464e-06, + "loss": 0.5683, + "step": 590 + }, + { + "epoch": 0.40984743411927876, + "grad_norm": 0.3106147483563408, + "learning_rate": 8.196948682385576e-06, + "loss": 0.5945, + "step": 591 + }, + { + "epoch": 0.4105409153952843, + "grad_norm": 0.35094025241047616, + "learning_rate": 8.210818307905687e-06, + "loss": 0.5985, + "step": 592 + }, + { + "epoch": 0.41123439667128986, + "grad_norm": 0.3603207466662012, + "learning_rate": 8.224687933425797e-06, + "loss": 0.633, + "step": 593 + }, + { + "epoch": 0.4119278779472954, + "grad_norm": 0.3955033347968271, + "learning_rate": 8.238557558945909e-06, + "loss": 0.6413, + "step": 594 + }, + { + "epoch": 0.41262135922330095, + "grad_norm": 0.36000565915117894, + "learning_rate": 8.25242718446602e-06, + "loss": 0.6554, + "step": 595 + }, + { + "epoch": 0.4133148404993065, + "grad_norm": 0.34329202409414755, + "learning_rate": 8.266296809986132e-06, + "loss": 0.6067, + "step": 596 + }, + { + "epoch": 0.41400832177531205, + "grad_norm": 0.3128773773505845, + "learning_rate": 8.280166435506241e-06, + "loss": 0.59, + "step": 597 + }, + { + "epoch": 0.4147018030513176, + "grad_norm": 0.4023227741583138, + "learning_rate": 8.294036061026353e-06, + "loss": 0.5882, + "step": 598 + }, + { + "epoch": 0.41539528432732314, + "grad_norm": 0.3891397827002362, + "learning_rate": 8.307905686546464e-06, + "loss": 0.681, + "step": 599 + }, + { + "epoch": 0.4160887656033287, + "grad_norm": 0.3720371127317372, + "learning_rate": 8.321775312066574e-06, + "loss": 0.618, + "step": 600 + }, + { + "epoch": 0.41678224687933424, + "grad_norm": 0.35999608094557894, + "learning_rate": 8.335644937586686e-06, + "loss": 0.7464, + "step": 601 + }, + { + "epoch": 0.4174757281553398, + "grad_norm": 0.3574131985513801, + "learning_rate": 8.349514563106797e-06, + "loss": 0.643, + "step": 602 + }, + { + "epoch": 0.41816920943134533, + "grad_norm": 0.3901567629810148, + "learning_rate": 8.363384188626907e-06, + "loss": 0.6128, + "step": 603 + }, + { + "epoch": 0.4188626907073509, + "grad_norm": 0.3121366927797027, + "learning_rate": 8.377253814147018e-06, + "loss": 0.5881, + "step": 604 + }, + { + "epoch": 0.41955617198335643, + "grad_norm": 7.777784187646424, + "learning_rate": 8.39112343966713e-06, + "loss": 0.6798, + "step": 605 + }, + { + "epoch": 0.420249653259362, + "grad_norm": 0.37283542003408443, + "learning_rate": 8.40499306518724e-06, + "loss": 0.5584, + "step": 606 + }, + { + "epoch": 0.4209431345353675, + "grad_norm": 0.4133622213604055, + "learning_rate": 8.418862690707351e-06, + "loss": 0.6643, + "step": 607 + }, + { + "epoch": 0.42163661581137307, + "grad_norm": 0.3683511948530525, + "learning_rate": 8.432732316227463e-06, + "loss": 0.6531, + "step": 608 + }, + { + "epoch": 0.4223300970873786, + "grad_norm": 0.3424953294175224, + "learning_rate": 8.446601941747573e-06, + "loss": 0.6101, + "step": 609 + }, + { + "epoch": 0.42302357836338417, + "grad_norm": 0.33224537868688725, + "learning_rate": 8.460471567267684e-06, + "loss": 0.6157, + "step": 610 + }, + { + "epoch": 0.4237170596393897, + "grad_norm": 0.36205005650227956, + "learning_rate": 8.474341192787796e-06, + "loss": 0.5699, + "step": 611 + }, + { + "epoch": 0.42441054091539526, + "grad_norm": 0.3583343067676532, + "learning_rate": 8.488210818307905e-06, + "loss": 0.6227, + "step": 612 + }, + { + "epoch": 0.4251040221914008, + "grad_norm": 0.34724699542050486, + "learning_rate": 8.502080443828017e-06, + "loss": 0.618, + "step": 613 + }, + { + "epoch": 0.42579750346740636, + "grad_norm": 0.3588354342662689, + "learning_rate": 8.515950069348128e-06, + "loss": 0.6174, + "step": 614 + }, + { + "epoch": 0.4264909847434119, + "grad_norm": 0.3275400430150551, + "learning_rate": 8.529819694868238e-06, + "loss": 0.5574, + "step": 615 + }, + { + "epoch": 0.42718446601941745, + "grad_norm": 0.3925014990832987, + "learning_rate": 8.54368932038835e-06, + "loss": 0.5617, + "step": 616 + }, + { + "epoch": 0.427877947295423, + "grad_norm": 0.37400960767024355, + "learning_rate": 8.557558945908461e-06, + "loss": 0.644, + "step": 617 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 0.33813596564574294, + "learning_rate": 8.571428571428571e-06, + "loss": 0.55, + "step": 618 + }, + { + "epoch": 0.4292649098474341, + "grad_norm": 0.32360010608792594, + "learning_rate": 8.585298196948682e-06, + "loss": 0.6387, + "step": 619 + }, + { + "epoch": 0.42995839112343964, + "grad_norm": 0.3525534513240279, + "learning_rate": 8.599167822468794e-06, + "loss": 0.6646, + "step": 620 + }, + { + "epoch": 0.4306518723994452, + "grad_norm": 0.41679474612151624, + "learning_rate": 8.613037447988904e-06, + "loss": 0.636, + "step": 621 + }, + { + "epoch": 0.43134535367545074, + "grad_norm": 0.3684984554461236, + "learning_rate": 8.626907073509015e-06, + "loss": 0.6316, + "step": 622 + }, + { + "epoch": 0.4320388349514563, + "grad_norm": 0.3168038897942362, + "learning_rate": 8.640776699029127e-06, + "loss": 0.5389, + "step": 623 + }, + { + "epoch": 0.43273231622746183, + "grad_norm": 0.3490477207986355, + "learning_rate": 8.654646324549238e-06, + "loss": 0.6288, + "step": 624 + }, + { + "epoch": 0.4334257975034674, + "grad_norm": 0.305919050156397, + "learning_rate": 8.668515950069348e-06, + "loss": 0.5392, + "step": 625 + }, + { + "epoch": 0.43411927877947293, + "grad_norm": 0.3309731211114467, + "learning_rate": 8.68238557558946e-06, + "loss": 0.6231, + "step": 626 + }, + { + "epoch": 0.4348127600554785, + "grad_norm": 0.36591059473372156, + "learning_rate": 8.696255201109571e-06, + "loss": 0.6402, + "step": 627 + }, + { + "epoch": 0.435506241331484, + "grad_norm": 0.38059866948975574, + "learning_rate": 8.71012482662968e-06, + "loss": 0.6703, + "step": 628 + }, + { + "epoch": 0.43619972260748957, + "grad_norm": 0.3523777807232137, + "learning_rate": 8.723994452149792e-06, + "loss": 0.5613, + "step": 629 + }, + { + "epoch": 0.4368932038834951, + "grad_norm": 0.5448638090461266, + "learning_rate": 8.737864077669904e-06, + "loss": 0.6361, + "step": 630 + }, + { + "epoch": 0.4375866851595007, + "grad_norm": 0.3586127060142806, + "learning_rate": 8.751733703190015e-06, + "loss": 0.5749, + "step": 631 + }, + { + "epoch": 0.43828016643550627, + "grad_norm": 0.3747722318686593, + "learning_rate": 8.765603328710127e-06, + "loss": 0.6181, + "step": 632 + }, + { + "epoch": 0.4389736477115118, + "grad_norm": 0.3920628828663998, + "learning_rate": 8.779472954230237e-06, + "loss": 0.6673, + "step": 633 + }, + { + "epoch": 0.43966712898751736, + "grad_norm": 0.37818417962654277, + "learning_rate": 8.793342579750348e-06, + "loss": 0.6165, + "step": 634 + }, + { + "epoch": 0.4403606102635229, + "grad_norm": 0.349001623054722, + "learning_rate": 8.80721220527046e-06, + "loss": 0.578, + "step": 635 + }, + { + "epoch": 0.44105409153952846, + "grad_norm": 0.3473644950842794, + "learning_rate": 8.82108183079057e-06, + "loss": 0.6422, + "step": 636 + }, + { + "epoch": 0.441747572815534, + "grad_norm": 0.3842969319738716, + "learning_rate": 8.834951456310681e-06, + "loss": 0.6711, + "step": 637 + }, + { + "epoch": 0.44244105409153955, + "grad_norm": 0.37756291569586337, + "learning_rate": 8.848821081830792e-06, + "loss": 0.6555, + "step": 638 + }, + { + "epoch": 0.4431345353675451, + "grad_norm": 0.40117967766113527, + "learning_rate": 8.862690707350902e-06, + "loss": 0.6397, + "step": 639 + }, + { + "epoch": 0.44382801664355065, + "grad_norm": 0.3405515190628938, + "learning_rate": 8.876560332871014e-06, + "loss": 0.6025, + "step": 640 + }, + { + "epoch": 0.4445214979195562, + "grad_norm": 0.3725640263256581, + "learning_rate": 8.890429958391125e-06, + "loss": 0.5874, + "step": 641 + }, + { + "epoch": 0.44521497919556174, + "grad_norm": 0.3428200244683828, + "learning_rate": 8.904299583911235e-06, + "loss": 0.5893, + "step": 642 + }, + { + "epoch": 0.4459084604715673, + "grad_norm": 0.33252775268279633, + "learning_rate": 8.918169209431346e-06, + "loss": 0.6661, + "step": 643 + }, + { + "epoch": 0.44660194174757284, + "grad_norm": 0.3830520211600166, + "learning_rate": 8.932038834951458e-06, + "loss": 0.5683, + "step": 644 + }, + { + "epoch": 0.4472954230235784, + "grad_norm": 0.3715958039866768, + "learning_rate": 8.945908460471568e-06, + "loss": 0.6138, + "step": 645 + }, + { + "epoch": 0.44798890429958393, + "grad_norm": 0.35265837994315496, + "learning_rate": 8.95977808599168e-06, + "loss": 0.5994, + "step": 646 + }, + { + "epoch": 0.4486823855755895, + "grad_norm": 0.33039197619618926, + "learning_rate": 8.97364771151179e-06, + "loss": 0.6264, + "step": 647 + }, + { + "epoch": 0.44937586685159503, + "grad_norm": 0.384641530384446, + "learning_rate": 8.9875173370319e-06, + "loss": 0.642, + "step": 648 + }, + { + "epoch": 0.4500693481276006, + "grad_norm": 0.3607597153002498, + "learning_rate": 9.001386962552012e-06, + "loss": 0.6395, + "step": 649 + }, + { + "epoch": 0.4507628294036061, + "grad_norm": 0.35606092385216564, + "learning_rate": 9.015256588072124e-06, + "loss": 0.6401, + "step": 650 + }, + { + "epoch": 0.45145631067961167, + "grad_norm": 0.4090353440808489, + "learning_rate": 9.029126213592233e-06, + "loss": 0.5907, + "step": 651 + }, + { + "epoch": 0.4521497919556172, + "grad_norm": 0.37308131460336813, + "learning_rate": 9.042995839112345e-06, + "loss": 0.633, + "step": 652 + }, + { + "epoch": 0.45284327323162277, + "grad_norm": 0.3430242542135983, + "learning_rate": 9.056865464632456e-06, + "loss": 0.6001, + "step": 653 + }, + { + "epoch": 0.4535367545076283, + "grad_norm": 0.36017796963933435, + "learning_rate": 9.070735090152568e-06, + "loss": 0.6468, + "step": 654 + }, + { + "epoch": 0.45423023578363386, + "grad_norm": 0.4294104559681809, + "learning_rate": 9.084604715672678e-06, + "loss": 0.6611, + "step": 655 + }, + { + "epoch": 0.4549237170596394, + "grad_norm": 0.37074717444942756, + "learning_rate": 9.098474341192789e-06, + "loss": 0.6274, + "step": 656 + }, + { + "epoch": 0.45561719833564496, + "grad_norm": 0.34001423039867223, + "learning_rate": 9.1123439667129e-06, + "loss": 0.5995, + "step": 657 + }, + { + "epoch": 0.4563106796116505, + "grad_norm": 0.3601282007392672, + "learning_rate": 9.12621359223301e-06, + "loss": 0.6339, + "step": 658 + }, + { + "epoch": 0.45700416088765605, + "grad_norm": 0.5888944383430742, + "learning_rate": 9.140083217753122e-06, + "loss": 0.6437, + "step": 659 + }, + { + "epoch": 0.4576976421636616, + "grad_norm": 0.3412602724562163, + "learning_rate": 9.153952843273233e-06, + "loss": 0.599, + "step": 660 + }, + { + "epoch": 0.45839112343966715, + "grad_norm": 0.37562759117690714, + "learning_rate": 9.167822468793343e-06, + "loss": 0.6625, + "step": 661 + }, + { + "epoch": 0.4590846047156727, + "grad_norm": 0.31646784457755045, + "learning_rate": 9.181692094313455e-06, + "loss": 0.6401, + "step": 662 + }, + { + "epoch": 0.45977808599167824, + "grad_norm": 0.3100306138668391, + "learning_rate": 9.195561719833566e-06, + "loss": 0.6041, + "step": 663 + }, + { + "epoch": 0.4604715672676838, + "grad_norm": 0.35226961678368174, + "learning_rate": 9.209431345353676e-06, + "loss": 0.5334, + "step": 664 + }, + { + "epoch": 0.46116504854368934, + "grad_norm": 0.3410210807636282, + "learning_rate": 9.223300970873788e-06, + "loss": 0.5999, + "step": 665 + }, + { + "epoch": 0.4618585298196949, + "grad_norm": 0.3801675232094619, + "learning_rate": 9.237170596393899e-06, + "loss": 0.6375, + "step": 666 + }, + { + "epoch": 0.46255201109570043, + "grad_norm": 0.364125012519612, + "learning_rate": 9.251040221914009e-06, + "loss": 0.6122, + "step": 667 + }, + { + "epoch": 0.463245492371706, + "grad_norm": 0.3461524800556262, + "learning_rate": 9.26490984743412e-06, + "loss": 0.6194, + "step": 668 + }, + { + "epoch": 0.46393897364771153, + "grad_norm": 0.3660251738461281, + "learning_rate": 9.278779472954232e-06, + "loss": 0.6357, + "step": 669 + }, + { + "epoch": 0.4646324549237171, + "grad_norm": 0.3402496059048011, + "learning_rate": 9.292649098474342e-06, + "loss": 0.643, + "step": 670 + }, + { + "epoch": 0.4653259361997226, + "grad_norm": 0.40563360025585726, + "learning_rate": 9.306518723994453e-06, + "loss": 0.5717, + "step": 671 + }, + { + "epoch": 0.46601941747572817, + "grad_norm": 0.36925076136939955, + "learning_rate": 9.320388349514565e-06, + "loss": 0.6217, + "step": 672 + }, + { + "epoch": 0.4667128987517337, + "grad_norm": 0.3478622265674262, + "learning_rate": 9.334257975034674e-06, + "loss": 0.581, + "step": 673 + }, + { + "epoch": 0.46740638002773927, + "grad_norm": 0.3317012135835466, + "learning_rate": 9.348127600554786e-06, + "loss": 0.5265, + "step": 674 + }, + { + "epoch": 0.4680998613037448, + "grad_norm": 0.36481458204878303, + "learning_rate": 9.361997226074897e-06, + "loss": 0.6234, + "step": 675 + }, + { + "epoch": 0.46879334257975036, + "grad_norm": 0.3447916072991981, + "learning_rate": 9.375866851595007e-06, + "loss": 0.5516, + "step": 676 + }, + { + "epoch": 0.4694868238557559, + "grad_norm": 0.3133773887900021, + "learning_rate": 9.389736477115119e-06, + "loss": 0.5596, + "step": 677 + }, + { + "epoch": 0.47018030513176146, + "grad_norm": 0.32308806295093884, + "learning_rate": 9.40360610263523e-06, + "loss": 0.5802, + "step": 678 + }, + { + "epoch": 0.470873786407767, + "grad_norm": 0.3481099319108013, + "learning_rate": 9.41747572815534e-06, + "loss": 0.669, + "step": 679 + }, + { + "epoch": 0.47156726768377255, + "grad_norm": 0.4351725032471424, + "learning_rate": 9.431345353675451e-06, + "loss": 0.7111, + "step": 680 + }, + { + "epoch": 0.4722607489597781, + "grad_norm": 0.36114338628781123, + "learning_rate": 9.445214979195563e-06, + "loss": 0.6631, + "step": 681 + }, + { + "epoch": 0.47295423023578365, + "grad_norm": 0.3800252832289555, + "learning_rate": 9.459084604715674e-06, + "loss": 0.6906, + "step": 682 + }, + { + "epoch": 0.4736477115117892, + "grad_norm": 0.38194549464555544, + "learning_rate": 9.472954230235784e-06, + "loss": 0.603, + "step": 683 + }, + { + "epoch": 0.47434119278779474, + "grad_norm": 0.36617573291203553, + "learning_rate": 9.486823855755896e-06, + "loss": 0.6271, + "step": 684 + }, + { + "epoch": 0.4750346740638003, + "grad_norm": 0.3264948505149436, + "learning_rate": 9.500693481276007e-06, + "loss": 0.5752, + "step": 685 + }, + { + "epoch": 0.47572815533980584, + "grad_norm": 0.3501432474989948, + "learning_rate": 9.514563106796117e-06, + "loss": 0.6722, + "step": 686 + }, + { + "epoch": 0.4764216366158114, + "grad_norm": 0.35169803665934024, + "learning_rate": 9.528432732316229e-06, + "loss": 0.5869, + "step": 687 + }, + { + "epoch": 0.47711511789181693, + "grad_norm": 0.31858527501959116, + "learning_rate": 9.54230235783634e-06, + "loss": 0.5992, + "step": 688 + }, + { + "epoch": 0.4778085991678225, + "grad_norm": 0.3300810353931968, + "learning_rate": 9.55617198335645e-06, + "loss": 0.5881, + "step": 689 + }, + { + "epoch": 0.478502080443828, + "grad_norm": 0.38880033996386565, + "learning_rate": 9.570041608876561e-06, + "loss": 0.6189, + "step": 690 + }, + { + "epoch": 0.4791955617198336, + "grad_norm": 0.36418060142418335, + "learning_rate": 9.583911234396673e-06, + "loss": 0.5857, + "step": 691 + }, + { + "epoch": 0.4798890429958391, + "grad_norm": 0.3540058535907166, + "learning_rate": 9.597780859916783e-06, + "loss": 0.5957, + "step": 692 + }, + { + "epoch": 0.48058252427184467, + "grad_norm": 0.31340466492363656, + "learning_rate": 9.611650485436894e-06, + "loss": 0.6161, + "step": 693 + }, + { + "epoch": 0.4812760055478502, + "grad_norm": 0.4557610208993212, + "learning_rate": 9.625520110957006e-06, + "loss": 0.7067, + "step": 694 + }, + { + "epoch": 0.48196948682385576, + "grad_norm": 0.38295004302091745, + "learning_rate": 9.639389736477115e-06, + "loss": 0.5717, + "step": 695 + }, + { + "epoch": 0.4826629680998613, + "grad_norm": 0.37190722678317617, + "learning_rate": 9.653259361997227e-06, + "loss": 0.6014, + "step": 696 + }, + { + "epoch": 0.48335644937586686, + "grad_norm": 0.36611389398482685, + "learning_rate": 9.667128987517338e-06, + "loss": 0.6137, + "step": 697 + }, + { + "epoch": 0.4840499306518724, + "grad_norm": 0.32538737533209905, + "learning_rate": 9.680998613037448e-06, + "loss": 0.5112, + "step": 698 + }, + { + "epoch": 0.48474341192787795, + "grad_norm": 0.3463470381288639, + "learning_rate": 9.69486823855756e-06, + "loss": 0.6678, + "step": 699 + }, + { + "epoch": 0.4854368932038835, + "grad_norm": 0.3824644240082401, + "learning_rate": 9.708737864077671e-06, + "loss": 0.7011, + "step": 700 + }, + { + "epoch": 0.48613037447988905, + "grad_norm": 0.34319878679914073, + "learning_rate": 9.722607489597781e-06, + "loss": 0.6133, + "step": 701 + }, + { + "epoch": 0.4868238557558946, + "grad_norm": 0.33961240170897133, + "learning_rate": 9.736477115117893e-06, + "loss": 0.617, + "step": 702 + }, + { + "epoch": 0.48751733703190014, + "grad_norm": 0.36115468346237317, + "learning_rate": 9.750346740638004e-06, + "loss": 0.5648, + "step": 703 + }, + { + "epoch": 0.4882108183079057, + "grad_norm": 0.34936958202921586, + "learning_rate": 9.764216366158114e-06, + "loss": 0.6263, + "step": 704 + }, + { + "epoch": 0.48890429958391124, + "grad_norm": 0.4050860259334066, + "learning_rate": 9.778085991678225e-06, + "loss": 0.6111, + "step": 705 + }, + { + "epoch": 0.4895977808599168, + "grad_norm": 0.32637207500367693, + "learning_rate": 9.791955617198337e-06, + "loss": 0.5298, + "step": 706 + }, + { + "epoch": 0.49029126213592233, + "grad_norm": 0.32587264169952956, + "learning_rate": 9.805825242718447e-06, + "loss": 0.5618, + "step": 707 + }, + { + "epoch": 0.4909847434119279, + "grad_norm": 0.34778623114061646, + "learning_rate": 9.819694868238558e-06, + "loss": 0.6204, + "step": 708 + }, + { + "epoch": 0.49167822468793343, + "grad_norm": 0.36103341538436834, + "learning_rate": 9.83356449375867e-06, + "loss": 0.6517, + "step": 709 + }, + { + "epoch": 0.492371705963939, + "grad_norm": 0.33128531808851164, + "learning_rate": 9.847434119278781e-06, + "loss": 0.5713, + "step": 710 + }, + { + "epoch": 0.4930651872399445, + "grad_norm": 0.3784980712716657, + "learning_rate": 9.861303744798891e-06, + "loss": 0.6977, + "step": 711 + }, + { + "epoch": 0.49375866851595007, + "grad_norm": 0.3402060319209778, + "learning_rate": 9.875173370319002e-06, + "loss": 0.6149, + "step": 712 + }, + { + "epoch": 0.4944521497919556, + "grad_norm": 0.3382292644840488, + "learning_rate": 9.889042995839114e-06, + "loss": 0.5593, + "step": 713 + }, + { + "epoch": 0.49514563106796117, + "grad_norm": 0.33045995267051376, + "learning_rate": 9.902912621359224e-06, + "loss": 0.595, + "step": 714 + }, + { + "epoch": 0.4958391123439667, + "grad_norm": 0.3522171989161278, + "learning_rate": 9.916782246879335e-06, + "loss": 0.6228, + "step": 715 + }, + { + "epoch": 0.49653259361997226, + "grad_norm": 0.3477928080599467, + "learning_rate": 9.930651872399447e-06, + "loss": 0.6339, + "step": 716 + }, + { + "epoch": 0.4972260748959778, + "grad_norm": 0.38447654259666586, + "learning_rate": 9.944521497919557e-06, + "loss": 0.6673, + "step": 717 + }, + { + "epoch": 0.49791955617198336, + "grad_norm": 0.32543806149067384, + "learning_rate": 9.958391123439668e-06, + "loss": 0.624, + "step": 718 + }, + { + "epoch": 0.4986130374479889, + "grad_norm": 0.36357628229093936, + "learning_rate": 9.97226074895978e-06, + "loss": 0.6236, + "step": 719 + }, + { + "epoch": 0.49930651872399445, + "grad_norm": 0.3518223270174587, + "learning_rate": 9.98613037447989e-06, + "loss": 0.6528, + "step": 720 + }, + { + "epoch": 0.5, + "grad_norm": 0.30952998333257387, + "learning_rate": 1e-05, + "loss": 0.54, + "step": 721 + }, + { + "epoch": 0.5006934812760055, + "grad_norm": 0.41501604220152166, + "learning_rate": 9.999999414018107e-06, + "loss": 0.5692, + "step": 722 + }, + { + "epoch": 0.5013869625520111, + "grad_norm": 0.32514562243010514, + "learning_rate": 9.999997656072562e-06, + "loss": 0.5872, + "step": 723 + }, + { + "epoch": 0.5020804438280166, + "grad_norm": 0.3470627802984313, + "learning_rate": 9.999994726163778e-06, + "loss": 0.6359, + "step": 724 + }, + { + "epoch": 0.5027739251040222, + "grad_norm": 0.3745973618825542, + "learning_rate": 9.999990624292442e-06, + "loss": 0.647, + "step": 725 + }, + { + "epoch": 0.5034674063800277, + "grad_norm": 0.34450891072001916, + "learning_rate": 9.999985350459514e-06, + "loss": 0.6046, + "step": 726 + }, + { + "epoch": 0.5041608876560333, + "grad_norm": 0.33360430874820624, + "learning_rate": 9.999978904666233e-06, + "loss": 0.5942, + "step": 727 + }, + { + "epoch": 0.5048543689320388, + "grad_norm": 0.33409116101469366, + "learning_rate": 9.999971286914108e-06, + "loss": 0.5498, + "step": 728 + }, + { + "epoch": 0.5055478502080444, + "grad_norm": 0.33690900277471875, + "learning_rate": 9.999962497204925e-06, + "loss": 0.6787, + "step": 729 + }, + { + "epoch": 0.5062413314840499, + "grad_norm": 0.3299105111409337, + "learning_rate": 9.999952535540743e-06, + "loss": 0.5169, + "step": 730 + }, + { + "epoch": 0.5069348127600555, + "grad_norm": 0.35733261526077775, + "learning_rate": 9.999941401923899e-06, + "loss": 0.5994, + "step": 731 + }, + { + "epoch": 0.507628294036061, + "grad_norm": 0.37167083547332896, + "learning_rate": 9.999929096357001e-06, + "loss": 0.5934, + "step": 732 + }, + { + "epoch": 0.5083217753120666, + "grad_norm": 0.3410947793093965, + "learning_rate": 9.999915618842935e-06, + "loss": 0.6686, + "step": 733 + }, + { + "epoch": 0.5090152565880721, + "grad_norm": 0.35813602667713473, + "learning_rate": 9.99990096938486e-06, + "loss": 0.5926, + "step": 734 + }, + { + "epoch": 0.5097087378640777, + "grad_norm": 0.34850411155772887, + "learning_rate": 9.999885147986207e-06, + "loss": 0.6172, + "step": 735 + }, + { + "epoch": 0.5104022191400832, + "grad_norm": 0.3997462176758978, + "learning_rate": 9.999868154650686e-06, + "loss": 0.6658, + "step": 736 + }, + { + "epoch": 0.5110957004160888, + "grad_norm": 0.3581097909925207, + "learning_rate": 9.99984998938228e-06, + "loss": 0.5888, + "step": 737 + }, + { + "epoch": 0.5117891816920943, + "grad_norm": 0.328277957660063, + "learning_rate": 9.999830652185248e-06, + "loss": 0.5651, + "step": 738 + }, + { + "epoch": 0.5124826629680999, + "grad_norm": 0.46818075793680175, + "learning_rate": 9.999810143064122e-06, + "loss": 0.6999, + "step": 739 + }, + { + "epoch": 0.5131761442441054, + "grad_norm": 0.3522552661114412, + "learning_rate": 9.999788462023707e-06, + "loss": 0.6181, + "step": 740 + }, + { + "epoch": 0.513869625520111, + "grad_norm": 0.33841693670191736, + "learning_rate": 9.99976560906909e-06, + "loss": 0.5506, + "step": 741 + }, + { + "epoch": 0.5145631067961165, + "grad_norm": 0.32448542256857915, + "learning_rate": 9.999741584205621e-06, + "loss": 0.5528, + "step": 742 + }, + { + "epoch": 0.515256588072122, + "grad_norm": 0.4015374961363589, + "learning_rate": 9.999716387438935e-06, + "loss": 0.6397, + "step": 743 + }, + { + "epoch": 0.5159500693481276, + "grad_norm": 0.35333372732488916, + "learning_rate": 9.999690018774939e-06, + "loss": 0.6324, + "step": 744 + }, + { + "epoch": 0.5166435506241331, + "grad_norm": 0.3118523478259235, + "learning_rate": 9.99966247821981e-06, + "loss": 0.6094, + "step": 745 + }, + { + "epoch": 0.5173370319001387, + "grad_norm": 0.3623486446023633, + "learning_rate": 9.999633765780008e-06, + "loss": 0.5759, + "step": 746 + }, + { + "epoch": 0.5180305131761442, + "grad_norm": 0.35080946017865067, + "learning_rate": 9.999603881462258e-06, + "loss": 0.6108, + "step": 747 + }, + { + "epoch": 0.5187239944521498, + "grad_norm": 0.34531983652521164, + "learning_rate": 9.999572825273569e-06, + "loss": 0.6456, + "step": 748 + }, + { + "epoch": 0.5194174757281553, + "grad_norm": 0.3568512706854189, + "learning_rate": 9.999540597221217e-06, + "loss": 0.591, + "step": 749 + }, + { + "epoch": 0.5201109570041609, + "grad_norm": 0.3872941250866843, + "learning_rate": 9.999507197312756e-06, + "loss": 0.6462, + "step": 750 + }, + { + "epoch": 0.5208044382801664, + "grad_norm": 0.36623504971829873, + "learning_rate": 9.999472625556019e-06, + "loss": 0.6365, + "step": 751 + }, + { + "epoch": 0.521497919556172, + "grad_norm": 0.3594091145576425, + "learning_rate": 9.999436881959105e-06, + "loss": 0.5919, + "step": 752 + }, + { + "epoch": 0.5221914008321775, + "grad_norm": 0.3270076408023314, + "learning_rate": 9.999399966530394e-06, + "loss": 0.56, + "step": 753 + }, + { + "epoch": 0.5228848821081831, + "grad_norm": 0.34133621043017787, + "learning_rate": 9.999361879278537e-06, + "loss": 0.5661, + "step": 754 + }, + { + "epoch": 0.5235783633841886, + "grad_norm": 0.37079359026140135, + "learning_rate": 9.999322620212463e-06, + "loss": 0.5884, + "step": 755 + }, + { + "epoch": 0.5242718446601942, + "grad_norm": 0.3413876553175361, + "learning_rate": 9.999282189341374e-06, + "loss": 0.6294, + "step": 756 + }, + { + "epoch": 0.5249653259361997, + "grad_norm": 0.32037776440650106, + "learning_rate": 9.999240586674749e-06, + "loss": 0.5893, + "step": 757 + }, + { + "epoch": 0.5256588072122053, + "grad_norm": 0.372701778382164, + "learning_rate": 9.999197812222332e-06, + "loss": 0.6726, + "step": 758 + }, + { + "epoch": 0.5263522884882108, + "grad_norm": 0.34301272895701906, + "learning_rate": 9.999153865994156e-06, + "loss": 0.5923, + "step": 759 + }, + { + "epoch": 0.5270457697642164, + "grad_norm": 0.36779244991125787, + "learning_rate": 9.999108748000519e-06, + "loss": 0.6126, + "step": 760 + }, + { + "epoch": 0.5277392510402219, + "grad_norm": 0.3594133655125482, + "learning_rate": 9.999062458251999e-06, + "loss": 0.5625, + "step": 761 + }, + { + "epoch": 0.5284327323162274, + "grad_norm": 0.3760775731146585, + "learning_rate": 9.99901499675944e-06, + "loss": 0.5998, + "step": 762 + }, + { + "epoch": 0.529126213592233, + "grad_norm": 0.3508720821753457, + "learning_rate": 9.998966363533972e-06, + "loss": 0.6119, + "step": 763 + }, + { + "epoch": 0.5298196948682385, + "grad_norm": 0.3121906419870047, + "learning_rate": 9.998916558586992e-06, + "loss": 0.5977, + "step": 764 + }, + { + "epoch": 0.5305131761442441, + "grad_norm": 0.31578410096983867, + "learning_rate": 9.998865581930176e-06, + "loss": 0.5755, + "step": 765 + }, + { + "epoch": 0.5312066574202496, + "grad_norm": 0.34650022010727266, + "learning_rate": 9.99881343357547e-06, + "loss": 0.6215, + "step": 766 + }, + { + "epoch": 0.5319001386962552, + "grad_norm": 0.3871096605321635, + "learning_rate": 9.998760113535097e-06, + "loss": 0.6004, + "step": 767 + }, + { + "epoch": 0.5325936199722607, + "grad_norm": 0.35318927332741834, + "learning_rate": 9.998705621821559e-06, + "loss": 0.6363, + "step": 768 + }, + { + "epoch": 0.5332871012482663, + "grad_norm": 0.32787154942007024, + "learning_rate": 9.998649958447624e-06, + "loss": 0.6026, + "step": 769 + }, + { + "epoch": 0.5339805825242718, + "grad_norm": 0.31997094021759737, + "learning_rate": 9.99859312342634e-06, + "loss": 0.5623, + "step": 770 + }, + { + "epoch": 0.5346740638002774, + "grad_norm": 0.34604820086773813, + "learning_rate": 9.99853511677103e-06, + "loss": 0.6085, + "step": 771 + }, + { + "epoch": 0.5353675450762829, + "grad_norm": 0.32553598125538497, + "learning_rate": 9.99847593849529e-06, + "loss": 0.5923, + "step": 772 + }, + { + "epoch": 0.5360610263522885, + "grad_norm": 0.40036028972077475, + "learning_rate": 9.99841558861299e-06, + "loss": 0.567, + "step": 773 + }, + { + "epoch": 0.536754507628294, + "grad_norm": 0.33160374278228255, + "learning_rate": 9.998354067138276e-06, + "loss": 0.6293, + "step": 774 + }, + { + "epoch": 0.5374479889042996, + "grad_norm": 0.36155830359485086, + "learning_rate": 9.99829137408557e-06, + "loss": 0.5709, + "step": 775 + }, + { + "epoch": 0.5381414701803051, + "grad_norm": 0.3915838316467222, + "learning_rate": 9.998227509469565e-06, + "loss": 0.5708, + "step": 776 + }, + { + "epoch": 0.5388349514563107, + "grad_norm": 0.3425169869927215, + "learning_rate": 9.998162473305229e-06, + "loss": 0.606, + "step": 777 + }, + { + "epoch": 0.5395284327323162, + "grad_norm": 0.35487253854695056, + "learning_rate": 9.99809626560781e-06, + "loss": 0.6234, + "step": 778 + }, + { + "epoch": 0.5402219140083218, + "grad_norm": 0.3916036489836279, + "learning_rate": 9.998028886392821e-06, + "loss": 0.6292, + "step": 779 + }, + { + "epoch": 0.5409153952843273, + "grad_norm": 0.35658352394717213, + "learning_rate": 9.997960335676062e-06, + "loss": 0.5864, + "step": 780 + }, + { + "epoch": 0.5416088765603329, + "grad_norm": 0.33151812149335985, + "learning_rate": 9.997890613473596e-06, + "loss": 0.5677, + "step": 781 + }, + { + "epoch": 0.5423023578363384, + "grad_norm": 0.33911034555027275, + "learning_rate": 9.997819719801766e-06, + "loss": 0.6162, + "step": 782 + }, + { + "epoch": 0.542995839112344, + "grad_norm": 0.5511777461489347, + "learning_rate": 9.99774765467719e-06, + "loss": 0.6809, + "step": 783 + }, + { + "epoch": 0.5436893203883495, + "grad_norm": 0.309334809552995, + "learning_rate": 9.997674418116759e-06, + "loss": 0.517, + "step": 784 + }, + { + "epoch": 0.544382801664355, + "grad_norm": 0.3434360109327229, + "learning_rate": 9.997600010137638e-06, + "loss": 0.5611, + "step": 785 + }, + { + "epoch": 0.5450762829403606, + "grad_norm": 0.3665740879948818, + "learning_rate": 9.99752443075727e-06, + "loss": 0.5807, + "step": 786 + }, + { + "epoch": 0.5457697642163661, + "grad_norm": 0.3651661888224723, + "learning_rate": 9.99744767999337e-06, + "loss": 0.5799, + "step": 787 + }, + { + "epoch": 0.5464632454923717, + "grad_norm": 0.3480518332197476, + "learning_rate": 9.997369757863926e-06, + "loss": 0.6049, + "step": 788 + }, + { + "epoch": 0.5471567267683772, + "grad_norm": 0.353988081230821, + "learning_rate": 9.997290664387205e-06, + "loss": 0.657, + "step": 789 + }, + { + "epoch": 0.5478502080443828, + "grad_norm": 0.39297584753933945, + "learning_rate": 9.997210399581742e-06, + "loss": 0.6452, + "step": 790 + }, + { + "epoch": 0.5485436893203883, + "grad_norm": 0.3933270411166608, + "learning_rate": 9.997128963466355e-06, + "loss": 0.6874, + "step": 791 + }, + { + "epoch": 0.5492371705963939, + "grad_norm": 0.3843851610970679, + "learning_rate": 9.99704635606013e-06, + "loss": 0.6197, + "step": 792 + }, + { + "epoch": 0.5499306518723994, + "grad_norm": 0.3563726960771155, + "learning_rate": 9.996962577382428e-06, + "loss": 0.6123, + "step": 793 + }, + { + "epoch": 0.550624133148405, + "grad_norm": 0.36131278518348864, + "learning_rate": 9.996877627452888e-06, + "loss": 0.5633, + "step": 794 + }, + { + "epoch": 0.5513176144244105, + "grad_norm": 0.38944813314700005, + "learning_rate": 9.99679150629142e-06, + "loss": 0.6196, + "step": 795 + }, + { + "epoch": 0.5520110957004161, + "grad_norm": 0.39273645679432406, + "learning_rate": 9.996704213918213e-06, + "loss": 0.593, + "step": 796 + }, + { + "epoch": 0.5527045769764216, + "grad_norm": 0.4213437548828512, + "learning_rate": 9.996615750353726e-06, + "loss": 0.5541, + "step": 797 + }, + { + "epoch": 0.5533980582524272, + "grad_norm": 0.3376625421997148, + "learning_rate": 9.996526115618694e-06, + "loss": 0.5734, + "step": 798 + }, + { + "epoch": 0.5540915395284327, + "grad_norm": 0.35609183352280943, + "learning_rate": 9.996435309734127e-06, + "loss": 0.6038, + "step": 799 + }, + { + "epoch": 0.5547850208044383, + "grad_norm": 0.34194422202558405, + "learning_rate": 9.996343332721308e-06, + "loss": 0.6384, + "step": 800 + }, + { + "epoch": 0.5554785020804438, + "grad_norm": 0.45980807415441105, + "learning_rate": 9.9962501846018e-06, + "loss": 0.6478, + "step": 801 + }, + { + "epoch": 0.5561719833564494, + "grad_norm": 0.356131120895226, + "learning_rate": 9.99615586539743e-06, + "loss": 0.5836, + "step": 802 + }, + { + "epoch": 0.5568654646324549, + "grad_norm": 0.3094389747187915, + "learning_rate": 9.99606037513031e-06, + "loss": 0.5792, + "step": 803 + }, + { + "epoch": 0.5575589459084604, + "grad_norm": 0.35424906911747067, + "learning_rate": 9.995963713822823e-06, + "loss": 0.5552, + "step": 804 + }, + { + "epoch": 0.558252427184466, + "grad_norm": 0.37134850928462093, + "learning_rate": 9.995865881497621e-06, + "loss": 0.6395, + "step": 805 + }, + { + "epoch": 0.5589459084604715, + "grad_norm": 0.3396083482831853, + "learning_rate": 9.995766878177641e-06, + "loss": 0.6559, + "step": 806 + }, + { + "epoch": 0.5596393897364771, + "grad_norm": 0.349721990866471, + "learning_rate": 9.995666703886084e-06, + "loss": 0.5718, + "step": 807 + }, + { + "epoch": 0.5603328710124826, + "grad_norm": 0.35772687569475387, + "learning_rate": 9.995565358646432e-06, + "loss": 0.5923, + "step": 808 + }, + { + "epoch": 0.5610263522884882, + "grad_norm": 0.3824059071857025, + "learning_rate": 9.995462842482441e-06, + "loss": 0.5519, + "step": 809 + }, + { + "epoch": 0.5617198335644937, + "grad_norm": 0.3284979543478124, + "learning_rate": 9.995359155418139e-06, + "loss": 0.5794, + "step": 810 + }, + { + "epoch": 0.5624133148404993, + "grad_norm": 0.34818162862155966, + "learning_rate": 9.995254297477827e-06, + "loss": 0.5904, + "step": 811 + }, + { + "epoch": 0.5631067961165048, + "grad_norm": 0.378188504715066, + "learning_rate": 9.995148268686086e-06, + "loss": 0.5819, + "step": 812 + }, + { + "epoch": 0.5638002773925104, + "grad_norm": 0.46087643944959833, + "learning_rate": 9.995041069067767e-06, + "loss": 0.5456, + "step": 813 + }, + { + "epoch": 0.5644937586685159, + "grad_norm": 0.3563574311634951, + "learning_rate": 9.994932698647997e-06, + "loss": 0.5933, + "step": 814 + }, + { + "epoch": 0.5651872399445215, + "grad_norm": 0.3641945948169763, + "learning_rate": 9.994823157452179e-06, + "loss": 0.5946, + "step": 815 + }, + { + "epoch": 0.565880721220527, + "grad_norm": 0.552909363074712, + "learning_rate": 9.994712445505985e-06, + "loss": 0.6261, + "step": 816 + }, + { + "epoch": 0.5665742024965326, + "grad_norm": 0.6516575100223244, + "learning_rate": 9.994600562835368e-06, + "loss": 0.6152, + "step": 817 + }, + { + "epoch": 0.5672676837725381, + "grad_norm": 0.3454900158635478, + "learning_rate": 9.99448750946655e-06, + "loss": 0.6046, + "step": 818 + }, + { + "epoch": 0.5679611650485437, + "grad_norm": 0.35071766339724286, + "learning_rate": 9.994373285426034e-06, + "loss": 0.6314, + "step": 819 + }, + { + "epoch": 0.5686546463245492, + "grad_norm": 0.43416563446791145, + "learning_rate": 9.99425789074059e-06, + "loss": 0.5519, + "step": 820 + }, + { + "epoch": 0.5693481276005548, + "grad_norm": 0.375679142764231, + "learning_rate": 9.994141325437269e-06, + "loss": 0.6496, + "step": 821 + }, + { + "epoch": 0.5700416088765603, + "grad_norm": 0.37883621605024626, + "learning_rate": 9.994023589543387e-06, + "loss": 0.5996, + "step": 822 + }, + { + "epoch": 0.5707350901525658, + "grad_norm": 0.3147504750589824, + "learning_rate": 9.993904683086544e-06, + "loss": 0.5832, + "step": 823 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.38615339122548875, + "learning_rate": 9.993784606094612e-06, + "loss": 0.6072, + "step": 824 + }, + { + "epoch": 0.5721220527045769, + "grad_norm": 0.34055537994616036, + "learning_rate": 9.993663358595736e-06, + "loss": 0.5793, + "step": 825 + }, + { + "epoch": 0.5728155339805825, + "grad_norm": 0.3479708413936868, + "learning_rate": 9.993540940618334e-06, + "loss": 0.5993, + "step": 826 + }, + { + "epoch": 0.573509015256588, + "grad_norm": 0.3430681485698919, + "learning_rate": 9.9934173521911e-06, + "loss": 0.5673, + "step": 827 + }, + { + "epoch": 0.5742024965325936, + "grad_norm": 0.40031566863708545, + "learning_rate": 9.993292593343003e-06, + "loss": 0.7237, + "step": 828 + }, + { + "epoch": 0.5748959778085991, + "grad_norm": 0.3324760719689896, + "learning_rate": 9.993166664103283e-06, + "loss": 0.608, + "step": 829 + }, + { + "epoch": 0.5755894590846047, + "grad_norm": 0.35885140959685163, + "learning_rate": 9.993039564501463e-06, + "loss": 0.6594, + "step": 830 + }, + { + "epoch": 0.5762829403606102, + "grad_norm": 0.399186315746986, + "learning_rate": 9.992911294567328e-06, + "loss": 0.5741, + "step": 831 + }, + { + "epoch": 0.5769764216366158, + "grad_norm": 0.35090334605835627, + "learning_rate": 9.992781854330946e-06, + "loss": 0.6422, + "step": 832 + }, + { + "epoch": 0.5776699029126213, + "grad_norm": 0.31889923872767023, + "learning_rate": 9.992651243822658e-06, + "loss": 0.5733, + "step": 833 + }, + { + "epoch": 0.5783633841886269, + "grad_norm": 0.3670958465002502, + "learning_rate": 9.992519463073077e-06, + "loss": 0.5851, + "step": 834 + }, + { + "epoch": 0.5790568654646324, + "grad_norm": 0.3338916412965299, + "learning_rate": 9.992386512113089e-06, + "loss": 0.5857, + "step": 835 + }, + { + "epoch": 0.579750346740638, + "grad_norm": 0.3583288330461265, + "learning_rate": 9.99225239097386e-06, + "loss": 0.6352, + "step": 836 + }, + { + "epoch": 0.5804438280166435, + "grad_norm": 0.3378780438558911, + "learning_rate": 9.992117099686828e-06, + "loss": 0.5691, + "step": 837 + }, + { + "epoch": 0.5811373092926491, + "grad_norm": 0.3256947131008986, + "learning_rate": 9.9919806382837e-06, + "loss": 0.608, + "step": 838 + }, + { + "epoch": 0.5818307905686546, + "grad_norm": 0.3332284083468561, + "learning_rate": 9.991843006796466e-06, + "loss": 0.5455, + "step": 839 + }, + { + "epoch": 0.5825242718446602, + "grad_norm": 0.34167001170988187, + "learning_rate": 9.991704205257383e-06, + "loss": 0.5673, + "step": 840 + }, + { + "epoch": 0.5832177531206657, + "grad_norm": 0.30566608826041025, + "learning_rate": 9.991564233698986e-06, + "loss": 0.5875, + "step": 841 + }, + { + "epoch": 0.5839112343966713, + "grad_norm": 0.329418583332585, + "learning_rate": 9.991423092154083e-06, + "loss": 0.5327, + "step": 842 + }, + { + "epoch": 0.5846047156726768, + "grad_norm": 0.30507560845133674, + "learning_rate": 9.991280780655757e-06, + "loss": 0.5691, + "step": 843 + }, + { + "epoch": 0.5852981969486823, + "grad_norm": 0.3617280149148349, + "learning_rate": 9.991137299237366e-06, + "loss": 0.6003, + "step": 844 + }, + { + "epoch": 0.5859916782246879, + "grad_norm": 0.3671582873180128, + "learning_rate": 9.990992647932537e-06, + "loss": 0.5819, + "step": 845 + }, + { + "epoch": 0.5866851595006934, + "grad_norm": 0.34613317062835486, + "learning_rate": 9.990846826775179e-06, + "loss": 0.5913, + "step": 846 + }, + { + "epoch": 0.587378640776699, + "grad_norm": 0.36465833562542455, + "learning_rate": 9.99069983579947e-06, + "loss": 0.5707, + "step": 847 + }, + { + "epoch": 0.5880721220527045, + "grad_norm": 0.380488480865335, + "learning_rate": 9.990551675039863e-06, + "loss": 0.6105, + "step": 848 + }, + { + "epoch": 0.5887656033287101, + "grad_norm": 0.3669644067681118, + "learning_rate": 9.990402344531089e-06, + "loss": 0.6314, + "step": 849 + }, + { + "epoch": 0.5894590846047156, + "grad_norm": 0.3634871716474428, + "learning_rate": 9.990251844308145e-06, + "loss": 0.5301, + "step": 850 + }, + { + "epoch": 0.5901525658807212, + "grad_norm": 0.38484347585083906, + "learning_rate": 9.990100174406313e-06, + "loss": 0.6124, + "step": 851 + }, + { + "epoch": 0.5908460471567267, + "grad_norm": 0.6158101712237795, + "learning_rate": 9.989947334861136e-06, + "loss": 0.6281, + "step": 852 + }, + { + "epoch": 0.5915395284327323, + "grad_norm": 0.33260041145785296, + "learning_rate": 9.989793325708446e-06, + "loss": 0.5399, + "step": 853 + }, + { + "epoch": 0.5922330097087378, + "grad_norm": 0.29013409357770487, + "learning_rate": 9.989638146984337e-06, + "loss": 0.507, + "step": 854 + }, + { + "epoch": 0.5929264909847434, + "grad_norm": 0.33762953372103877, + "learning_rate": 9.989481798725182e-06, + "loss": 0.6313, + "step": 855 + }, + { + "epoch": 0.5936199722607489, + "grad_norm": 0.3222219974052387, + "learning_rate": 9.98932428096763e-06, + "loss": 0.537, + "step": 856 + }, + { + "epoch": 0.5943134535367545, + "grad_norm": 0.35485383095369083, + "learning_rate": 9.989165593748602e-06, + "loss": 0.54, + "step": 857 + }, + { + "epoch": 0.59500693481276, + "grad_norm": 0.3414735268209781, + "learning_rate": 9.98900573710529e-06, + "loss": 0.5761, + "step": 858 + }, + { + "epoch": 0.5957004160887656, + "grad_norm": 0.3407146971721609, + "learning_rate": 9.988844711075166e-06, + "loss": 0.6076, + "step": 859 + }, + { + "epoch": 0.5963938973647711, + "grad_norm": 0.42069437792125053, + "learning_rate": 9.988682515695973e-06, + "loss": 0.6642, + "step": 860 + }, + { + "epoch": 0.5970873786407767, + "grad_norm": 0.3347845063179168, + "learning_rate": 9.988519151005728e-06, + "loss": 0.6106, + "step": 861 + }, + { + "epoch": 0.5977808599167822, + "grad_norm": 0.3579141738253978, + "learning_rate": 9.988354617042723e-06, + "loss": 0.5761, + "step": 862 + }, + { + "epoch": 0.5984743411927878, + "grad_norm": 0.3460153846615427, + "learning_rate": 9.988188913845523e-06, + "loss": 0.5772, + "step": 863 + }, + { + "epoch": 0.5991678224687933, + "grad_norm": 0.3222264974453905, + "learning_rate": 9.988022041452968e-06, + "loss": 0.598, + "step": 864 + }, + { + "epoch": 0.5998613037447988, + "grad_norm": 0.41823344334298795, + "learning_rate": 9.987853999904169e-06, + "loss": 0.6456, + "step": 865 + }, + { + "epoch": 0.6005547850208044, + "grad_norm": 0.3750284085110268, + "learning_rate": 9.98768478923852e-06, + "loss": 0.6575, + "step": 866 + }, + { + "epoch": 0.6012482662968099, + "grad_norm": 0.31564041331946846, + "learning_rate": 9.987514409495675e-06, + "loss": 0.5677, + "step": 867 + }, + { + "epoch": 0.6019417475728155, + "grad_norm": 0.37227726250887766, + "learning_rate": 9.987342860715575e-06, + "loss": 0.6333, + "step": 868 + }, + { + "epoch": 0.602635228848821, + "grad_norm": 0.4952523103716889, + "learning_rate": 9.987170142938429e-06, + "loss": 0.6417, + "step": 869 + }, + { + "epoch": 0.6033287101248266, + "grad_norm": 0.37715662413807755, + "learning_rate": 9.98699625620472e-06, + "loss": 0.5638, + "step": 870 + }, + { + "epoch": 0.6040221914008321, + "grad_norm": 0.33870170954321344, + "learning_rate": 9.986821200555206e-06, + "loss": 0.5748, + "step": 871 + }, + { + "epoch": 0.6047156726768377, + "grad_norm": 0.447394695448431, + "learning_rate": 9.98664497603092e-06, + "loss": 0.5667, + "step": 872 + }, + { + "epoch": 0.6054091539528432, + "grad_norm": 0.31993512669500684, + "learning_rate": 9.986467582673166e-06, + "loss": 0.6104, + "step": 873 + }, + { + "epoch": 0.6061026352288488, + "grad_norm": 0.34121670021324146, + "learning_rate": 9.986289020523525e-06, + "loss": 0.6298, + "step": 874 + }, + { + "epoch": 0.6067961165048543, + "grad_norm": 0.3352939710398173, + "learning_rate": 9.986109289623848e-06, + "loss": 0.5684, + "step": 875 + }, + { + "epoch": 0.6074895977808599, + "grad_norm": 0.32570600275910855, + "learning_rate": 9.985928390016267e-06, + "loss": 0.6172, + "step": 876 + }, + { + "epoch": 0.6081830790568654, + "grad_norm": 0.3132585321745443, + "learning_rate": 9.985746321743179e-06, + "loss": 0.5781, + "step": 877 + }, + { + "epoch": 0.608876560332871, + "grad_norm": 0.3334371299742966, + "learning_rate": 9.985563084847263e-06, + "loss": 0.5267, + "step": 878 + }, + { + "epoch": 0.6095700416088765, + "grad_norm": 0.37735940500200976, + "learning_rate": 9.985378679371465e-06, + "loss": 0.585, + "step": 879 + }, + { + "epoch": 0.6102635228848821, + "grad_norm": 0.3307596514634534, + "learning_rate": 9.985193105359013e-06, + "loss": 0.612, + "step": 880 + }, + { + "epoch": 0.6109570041608876, + "grad_norm": 0.3661289923865791, + "learning_rate": 9.9850063628534e-06, + "loss": 0.6442, + "step": 881 + }, + { + "epoch": 0.6116504854368932, + "grad_norm": 0.42036366886329213, + "learning_rate": 9.984818451898399e-06, + "loss": 0.6403, + "step": 882 + }, + { + "epoch": 0.6123439667128987, + "grad_norm": 0.3686093860629796, + "learning_rate": 9.984629372538055e-06, + "loss": 0.5862, + "step": 883 + }, + { + "epoch": 0.6130374479889042, + "grad_norm": 0.33930001299808243, + "learning_rate": 9.984439124816687e-06, + "loss": 0.5689, + "step": 884 + }, + { + "epoch": 0.6137309292649098, + "grad_norm": 0.3243308830978303, + "learning_rate": 9.984247708778887e-06, + "loss": 0.559, + "step": 885 + }, + { + "epoch": 0.6144244105409153, + "grad_norm": 0.32935726192149983, + "learning_rate": 9.98405512446952e-06, + "loss": 0.5722, + "step": 886 + }, + { + "epoch": 0.6151178918169209, + "grad_norm": 0.34674025215944076, + "learning_rate": 9.98386137193373e-06, + "loss": 0.5535, + "step": 887 + }, + { + "epoch": 0.6158113730929264, + "grad_norm": 0.31388258320678464, + "learning_rate": 9.983666451216927e-06, + "loss": 0.5374, + "step": 888 + }, + { + "epoch": 0.616504854368932, + "grad_norm": 0.33331461141805196, + "learning_rate": 9.983470362364803e-06, + "loss": 0.5992, + "step": 889 + }, + { + "epoch": 0.6171983356449375, + "grad_norm": 0.3377370185432441, + "learning_rate": 9.983273105423317e-06, + "loss": 0.5967, + "step": 890 + }, + { + "epoch": 0.6178918169209431, + "grad_norm": 0.33479539583423634, + "learning_rate": 9.983074680438707e-06, + "loss": 0.5522, + "step": 891 + }, + { + "epoch": 0.6185852981969486, + "grad_norm": 0.3126860064909479, + "learning_rate": 9.98287508745748e-06, + "loss": 0.5624, + "step": 892 + }, + { + "epoch": 0.6192787794729542, + "grad_norm": 0.3946057135855722, + "learning_rate": 9.98267432652642e-06, + "loss": 0.6693, + "step": 893 + }, + { + "epoch": 0.6199722607489597, + "grad_norm": 0.32483191440023923, + "learning_rate": 9.982472397692585e-06, + "loss": 0.6319, + "step": 894 + }, + { + "epoch": 0.6206657420249653, + "grad_norm": 0.33232060016498727, + "learning_rate": 9.982269301003305e-06, + "loss": 0.5309, + "step": 895 + }, + { + "epoch": 0.6213592233009708, + "grad_norm": 0.3147782800255537, + "learning_rate": 9.982065036506183e-06, + "loss": 0.5672, + "step": 896 + }, + { + "epoch": 0.6220527045769764, + "grad_norm": 0.33633547096338834, + "learning_rate": 9.981859604249098e-06, + "loss": 0.5898, + "step": 897 + }, + { + "epoch": 0.6227461858529819, + "grad_norm": 0.3918801038501328, + "learning_rate": 9.981653004280203e-06, + "loss": 0.5986, + "step": 898 + }, + { + "epoch": 0.6234396671289875, + "grad_norm": 0.38464214514703654, + "learning_rate": 9.981445236647923e-06, + "loss": 0.5878, + "step": 899 + }, + { + "epoch": 0.624133148404993, + "grad_norm": 0.30436655993558426, + "learning_rate": 9.981236301400955e-06, + "loss": 0.5737, + "step": 900 + }, + { + "epoch": 0.6248266296809986, + "grad_norm": 0.3189560160759198, + "learning_rate": 9.981026198588276e-06, + "loss": 0.6132, + "step": 901 + }, + { + "epoch": 0.6255201109570042, + "grad_norm": 0.3550589009893856, + "learning_rate": 9.980814928259129e-06, + "loss": 0.6255, + "step": 902 + }, + { + "epoch": 0.6262135922330098, + "grad_norm": 0.4997772712303221, + "learning_rate": 9.980602490463037e-06, + "loss": 0.5564, + "step": 903 + }, + { + "epoch": 0.6269070735090153, + "grad_norm": 0.34593571571772425, + "learning_rate": 9.98038888524979e-06, + "loss": 0.5459, + "step": 904 + }, + { + "epoch": 0.6276005547850209, + "grad_norm": 0.35195237449428995, + "learning_rate": 9.98017411266946e-06, + "loss": 0.5999, + "step": 905 + }, + { + "epoch": 0.6282940360610264, + "grad_norm": 0.3494946695679501, + "learning_rate": 9.979958172772386e-06, + "loss": 0.6094, + "step": 906 + }, + { + "epoch": 0.628987517337032, + "grad_norm": 0.32944296945437185, + "learning_rate": 9.979741065609182e-06, + "loss": 0.6239, + "step": 907 + }, + { + "epoch": 0.6296809986130375, + "grad_norm": 0.343188892969881, + "learning_rate": 9.979522791230739e-06, + "loss": 0.5749, + "step": 908 + }, + { + "epoch": 0.630374479889043, + "grad_norm": 0.3431693932435067, + "learning_rate": 9.979303349688214e-06, + "loss": 0.6418, + "step": 909 + }, + { + "epoch": 0.6310679611650486, + "grad_norm": 0.6260916997900723, + "learning_rate": 9.979082741033047e-06, + "loss": 0.6348, + "step": 910 + }, + { + "epoch": 0.6317614424410541, + "grad_norm": 0.3459150533249548, + "learning_rate": 9.978860965316945e-06, + "loss": 0.5591, + "step": 911 + }, + { + "epoch": 0.6324549237170597, + "grad_norm": 0.33333027810277965, + "learning_rate": 9.978638022591894e-06, + "loss": 0.5787, + "step": 912 + }, + { + "epoch": 0.6331484049930652, + "grad_norm": 0.32706243660827083, + "learning_rate": 9.978413912910145e-06, + "loss": 0.598, + "step": 913 + }, + { + "epoch": 0.6338418862690708, + "grad_norm": 0.3190908490026014, + "learning_rate": 9.978188636324231e-06, + "loss": 0.5783, + "step": 914 + }, + { + "epoch": 0.6345353675450763, + "grad_norm": 0.3449967775841397, + "learning_rate": 9.977962192886954e-06, + "loss": 0.5929, + "step": 915 + }, + { + "epoch": 0.6352288488210819, + "grad_norm": 0.3760267285165008, + "learning_rate": 9.977734582651393e-06, + "loss": 0.6492, + "step": 916 + }, + { + "epoch": 0.6359223300970874, + "grad_norm": 0.33146825505524, + "learning_rate": 9.977505805670895e-06, + "loss": 0.6276, + "step": 917 + }, + { + "epoch": 0.636615811373093, + "grad_norm": 0.3225866570301169, + "learning_rate": 9.977275861999084e-06, + "loss": 0.524, + "step": 918 + }, + { + "epoch": 0.6373092926490985, + "grad_norm": 0.3413315589556036, + "learning_rate": 9.977044751689858e-06, + "loss": 0.6302, + "step": 919 + }, + { + "epoch": 0.6380027739251041, + "grad_norm": 0.316767161688686, + "learning_rate": 9.976812474797388e-06, + "loss": 0.5139, + "step": 920 + }, + { + "epoch": 0.6386962552011096, + "grad_norm": 0.336423549371417, + "learning_rate": 9.97657903137612e-06, + "loss": 0.603, + "step": 921 + }, + { + "epoch": 0.6393897364771152, + "grad_norm": 0.3590292032224001, + "learning_rate": 9.976344421480766e-06, + "loss": 0.6633, + "step": 922 + }, + { + "epoch": 0.6400832177531207, + "grad_norm": 0.3409483030153749, + "learning_rate": 9.976108645166322e-06, + "loss": 0.6064, + "step": 923 + }, + { + "epoch": 0.6407766990291263, + "grad_norm": 0.3424195023715099, + "learning_rate": 9.97587170248805e-06, + "loss": 0.5987, + "step": 924 + }, + { + "epoch": 0.6414701803051318, + "grad_norm": 0.33700917472951303, + "learning_rate": 9.975633593501485e-06, + "loss": 0.6875, + "step": 925 + }, + { + "epoch": 0.6421636615811374, + "grad_norm": 0.32836587584052596, + "learning_rate": 9.975394318262443e-06, + "loss": 0.6266, + "step": 926 + }, + { + "epoch": 0.6428571428571429, + "grad_norm": 0.33101502685906026, + "learning_rate": 9.975153876827008e-06, + "loss": 0.6597, + "step": 927 + }, + { + "epoch": 0.6435506241331485, + "grad_norm": 0.33255835528358707, + "learning_rate": 9.974912269251534e-06, + "loss": 0.536, + "step": 928 + }, + { + "epoch": 0.644244105409154, + "grad_norm": 0.3603313792246961, + "learning_rate": 9.974669495592655e-06, + "loss": 0.594, + "step": 929 + }, + { + "epoch": 0.6449375866851595, + "grad_norm": 0.4947418665609005, + "learning_rate": 9.974425555907275e-06, + "loss": 0.5974, + "step": 930 + }, + { + "epoch": 0.6456310679611651, + "grad_norm": 0.3081307295348536, + "learning_rate": 9.97418045025257e-06, + "loss": 0.5338, + "step": 931 + }, + { + "epoch": 0.6463245492371706, + "grad_norm": 0.34525440992902945, + "learning_rate": 9.973934178685992e-06, + "loss": 0.5841, + "step": 932 + }, + { + "epoch": 0.6470180305131762, + "grad_norm": 0.3685839827243698, + "learning_rate": 9.973686741265265e-06, + "loss": 0.5864, + "step": 933 + }, + { + "epoch": 0.6477115117891817, + "grad_norm": 0.35534408188736244, + "learning_rate": 9.973438138048389e-06, + "loss": 0.5422, + "step": 934 + }, + { + "epoch": 0.6484049930651873, + "grad_norm": 0.3906368162822919, + "learning_rate": 9.973188369093631e-06, + "loss": 0.5768, + "step": 935 + }, + { + "epoch": 0.6490984743411928, + "grad_norm": 0.36012023338893784, + "learning_rate": 9.972937434459538e-06, + "loss": 0.545, + "step": 936 + }, + { + "epoch": 0.6497919556171984, + "grad_norm": 0.4037319051285458, + "learning_rate": 9.972685334204926e-06, + "loss": 0.6142, + "step": 937 + }, + { + "epoch": 0.6504854368932039, + "grad_norm": 0.3562140193168272, + "learning_rate": 9.972432068388885e-06, + "loss": 0.6215, + "step": 938 + }, + { + "epoch": 0.6511789181692095, + "grad_norm": 0.3334502163629595, + "learning_rate": 9.972177637070779e-06, + "loss": 0.624, + "step": 939 + }, + { + "epoch": 0.651872399445215, + "grad_norm": 0.3391506641330937, + "learning_rate": 9.971922040310244e-06, + "loss": 0.5857, + "step": 940 + }, + { + "epoch": 0.6525658807212206, + "grad_norm": 0.31304795665423457, + "learning_rate": 9.971665278167193e-06, + "loss": 0.5541, + "step": 941 + }, + { + "epoch": 0.6532593619972261, + "grad_norm": 0.3470678124095459, + "learning_rate": 9.971407350701808e-06, + "loss": 0.6141, + "step": 942 + }, + { + "epoch": 0.6539528432732317, + "grad_norm": 0.3640580816601225, + "learning_rate": 9.971148257974543e-06, + "loss": 0.5779, + "step": 943 + }, + { + "epoch": 0.6546463245492372, + "grad_norm": 0.36381434024896536, + "learning_rate": 9.97088800004613e-06, + "loss": 0.549, + "step": 944 + }, + { + "epoch": 0.6553398058252428, + "grad_norm": 0.40933415568980586, + "learning_rate": 9.97062657697757e-06, + "loss": 0.5656, + "step": 945 + }, + { + "epoch": 0.6560332871012483, + "grad_norm": 0.3588854690529746, + "learning_rate": 9.970363988830138e-06, + "loss": 0.597, + "step": 946 + }, + { + "epoch": 0.6567267683772539, + "grad_norm": 0.36464656084080477, + "learning_rate": 9.970100235665386e-06, + "loss": 0.5743, + "step": 947 + }, + { + "epoch": 0.6574202496532594, + "grad_norm": 0.32076356849363635, + "learning_rate": 9.969835317545133e-06, + "loss": 0.613, + "step": 948 + }, + { + "epoch": 0.658113730929265, + "grad_norm": 0.35441860473056175, + "learning_rate": 9.969569234531475e-06, + "loss": 0.6019, + "step": 949 + }, + { + "epoch": 0.6588072122052705, + "grad_norm": 0.3494071986318162, + "learning_rate": 9.969301986686782e-06, + "loss": 0.5707, + "step": 950 + }, + { + "epoch": 0.659500693481276, + "grad_norm": 0.34668012733349474, + "learning_rate": 9.969033574073689e-06, + "loss": 0.5333, + "step": 951 + }, + { + "epoch": 0.6601941747572816, + "grad_norm": 0.3338094515703168, + "learning_rate": 9.968763996755115e-06, + "loss": 0.5743, + "step": 952 + }, + { + "epoch": 0.6608876560332871, + "grad_norm": 0.33161620384603135, + "learning_rate": 9.968493254794247e-06, + "loss": 0.5493, + "step": 953 + }, + { + "epoch": 0.6615811373092927, + "grad_norm": 0.34568897389513337, + "learning_rate": 9.968221348254543e-06, + "loss": 0.601, + "step": 954 + }, + { + "epoch": 0.6622746185852982, + "grad_norm": 0.34684248566121006, + "learning_rate": 9.967948277199736e-06, + "loss": 0.5742, + "step": 955 + }, + { + "epoch": 0.6629680998613038, + "grad_norm": 0.3425555990494531, + "learning_rate": 9.967674041693831e-06, + "loss": 0.6075, + "step": 956 + }, + { + "epoch": 0.6636615811373093, + "grad_norm": 0.3336947049702203, + "learning_rate": 9.967398641801111e-06, + "loss": 0.5625, + "step": 957 + }, + { + "epoch": 0.6643550624133149, + "grad_norm": 0.3729854187748714, + "learning_rate": 9.967122077586124e-06, + "loss": 0.5725, + "step": 958 + }, + { + "epoch": 0.6650485436893204, + "grad_norm": 0.3357350735840092, + "learning_rate": 9.966844349113695e-06, + "loss": 0.5786, + "step": 959 + }, + { + "epoch": 0.665742024965326, + "grad_norm": 0.33270854302770486, + "learning_rate": 9.966565456448923e-06, + "loss": 0.5685, + "step": 960 + }, + { + "epoch": 0.6664355062413315, + "grad_norm": 0.31271771303028706, + "learning_rate": 9.966285399657175e-06, + "loss": 0.5746, + "step": 961 + }, + { + "epoch": 0.6671289875173371, + "grad_norm": 0.3367176285583568, + "learning_rate": 9.9660041788041e-06, + "loss": 0.5687, + "step": 962 + }, + { + "epoch": 0.6678224687933426, + "grad_norm": 0.31678615811257615, + "learning_rate": 9.965721793955609e-06, + "loss": 0.5883, + "step": 963 + }, + { + "epoch": 0.6685159500693482, + "grad_norm": 0.34291902832443727, + "learning_rate": 9.965438245177895e-06, + "loss": 0.6012, + "step": 964 + }, + { + "epoch": 0.6692094313453537, + "grad_norm": 0.3809367221460459, + "learning_rate": 9.965153532537416e-06, + "loss": 0.6233, + "step": 965 + }, + { + "epoch": 0.6699029126213593, + "grad_norm": 0.3547760519179594, + "learning_rate": 9.96486765610091e-06, + "loss": 0.5778, + "step": 966 + }, + { + "epoch": 0.6705963938973648, + "grad_norm": 0.36668523892323834, + "learning_rate": 9.96458061593538e-06, + "loss": 0.5746, + "step": 967 + }, + { + "epoch": 0.6712898751733704, + "grad_norm": 0.328016965053142, + "learning_rate": 9.964292412108109e-06, + "loss": 0.5915, + "step": 968 + }, + { + "epoch": 0.6719833564493759, + "grad_norm": 0.34407606317022954, + "learning_rate": 9.964003044686653e-06, + "loss": 0.5501, + "step": 969 + }, + { + "epoch": 0.6726768377253814, + "grad_norm": 0.35674597332836255, + "learning_rate": 9.963712513738832e-06, + "loss": 0.5614, + "step": 970 + }, + { + "epoch": 0.673370319001387, + "grad_norm": 0.341599707279666, + "learning_rate": 9.963420819332747e-06, + "loss": 0.5648, + "step": 971 + }, + { + "epoch": 0.6740638002773925, + "grad_norm": 0.32935977963907553, + "learning_rate": 9.963127961536769e-06, + "loss": 0.527, + "step": 972 + }, + { + "epoch": 0.6747572815533981, + "grad_norm": 0.4838100570879444, + "learning_rate": 9.96283394041954e-06, + "loss": 0.6332, + "step": 973 + }, + { + "epoch": 0.6754507628294036, + "grad_norm": 0.32862581119192874, + "learning_rate": 9.96253875604998e-06, + "loss": 0.5536, + "step": 974 + }, + { + "epoch": 0.6761442441054092, + "grad_norm": 0.3912141034292813, + "learning_rate": 9.962242408497274e-06, + "loss": 0.5508, + "step": 975 + }, + { + "epoch": 0.6768377253814147, + "grad_norm": 0.33381614494541395, + "learning_rate": 9.961944897830886e-06, + "loss": 0.5854, + "step": 976 + }, + { + "epoch": 0.6775312066574203, + "grad_norm": 0.36732923327552414, + "learning_rate": 9.961646224120551e-06, + "loss": 0.6146, + "step": 977 + }, + { + "epoch": 0.6782246879334258, + "grad_norm": 0.33845756731512416, + "learning_rate": 9.961346387436275e-06, + "loss": 0.6295, + "step": 978 + }, + { + "epoch": 0.6789181692094314, + "grad_norm": 0.34261448107799614, + "learning_rate": 9.961045387848338e-06, + "loss": 0.5808, + "step": 979 + }, + { + "epoch": 0.6796116504854369, + "grad_norm": 0.3187785442319545, + "learning_rate": 9.96074322542729e-06, + "loss": 0.5365, + "step": 980 + }, + { + "epoch": 0.6803051317614425, + "grad_norm": 0.32141773690416076, + "learning_rate": 9.960439900243959e-06, + "loss": 0.5913, + "step": 981 + }, + { + "epoch": 0.680998613037448, + "grad_norm": 0.3447109074751309, + "learning_rate": 9.960135412369441e-06, + "loss": 0.5398, + "step": 982 + }, + { + "epoch": 0.6816920943134536, + "grad_norm": 0.3505630074917437, + "learning_rate": 9.959829761875104e-06, + "loss": 0.6408, + "step": 983 + }, + { + "epoch": 0.6823855755894591, + "grad_norm": 0.3002610446520616, + "learning_rate": 9.959522948832591e-06, + "loss": 0.564, + "step": 984 + }, + { + "epoch": 0.6830790568654647, + "grad_norm": 0.3442848128163253, + "learning_rate": 9.959214973313818e-06, + "loss": 0.505, + "step": 985 + }, + { + "epoch": 0.6837725381414702, + "grad_norm": 0.3261013165861402, + "learning_rate": 9.958905835390972e-06, + "loss": 0.6332, + "step": 986 + }, + { + "epoch": 0.6844660194174758, + "grad_norm": 0.47876384543350736, + "learning_rate": 9.958595535136511e-06, + "loss": 0.5859, + "step": 987 + }, + { + "epoch": 0.6851595006934813, + "grad_norm": 0.34263392662042647, + "learning_rate": 9.95828407262317e-06, + "loss": 0.6131, + "step": 988 + }, + { + "epoch": 0.6858529819694869, + "grad_norm": 0.2971681507927114, + "learning_rate": 9.95797144792395e-06, + "loss": 0.5417, + "step": 989 + }, + { + "epoch": 0.6865464632454924, + "grad_norm": 0.36103153363073187, + "learning_rate": 9.957657661112133e-06, + "loss": 0.6176, + "step": 990 + }, + { + "epoch": 0.687239944521498, + "grad_norm": 0.35769346631114884, + "learning_rate": 9.957342712261263e-06, + "loss": 0.6555, + "step": 991 + }, + { + "epoch": 0.6879334257975035, + "grad_norm": 0.40510050967209915, + "learning_rate": 9.957026601445166e-06, + "loss": 0.63, + "step": 992 + }, + { + "epoch": 0.688626907073509, + "grad_norm": 0.32464645725429453, + "learning_rate": 9.95670932873793e-06, + "loss": 0.5737, + "step": 993 + }, + { + "epoch": 0.6893203883495146, + "grad_norm": 0.3400254370527942, + "learning_rate": 9.95639089421393e-06, + "loss": 0.5676, + "step": 994 + }, + { + "epoch": 0.6900138696255201, + "grad_norm": 0.4165921946357827, + "learning_rate": 9.956071297947798e-06, + "loss": 0.5453, + "step": 995 + }, + { + "epoch": 0.6907073509015257, + "grad_norm": 0.37321196629047915, + "learning_rate": 9.955750540014448e-06, + "loss": 0.5338, + "step": 996 + }, + { + "epoch": 0.6914008321775312, + "grad_norm": 0.33921327437833365, + "learning_rate": 9.955428620489062e-06, + "loss": 0.5379, + "step": 997 + }, + { + "epoch": 0.6920943134535368, + "grad_norm": 0.35440041687724594, + "learning_rate": 9.955105539447096e-06, + "loss": 0.6165, + "step": 998 + }, + { + "epoch": 0.6927877947295423, + "grad_norm": 0.3285590688117563, + "learning_rate": 9.954781296964279e-06, + "loss": 0.6196, + "step": 999 + }, + { + "epoch": 0.6934812760055479, + "grad_norm": 0.371636361595271, + "learning_rate": 9.95445589311661e-06, + "loss": 0.6161, + "step": 1000 + }, + { + "epoch": 0.6941747572815534, + "grad_norm": 0.3326524858879347, + "learning_rate": 9.954129327980362e-06, + "loss": 0.5515, + "step": 1001 + }, + { + "epoch": 0.694868238557559, + "grad_norm": 0.36226624672883995, + "learning_rate": 9.953801601632079e-06, + "loss": 0.5183, + "step": 1002 + }, + { + "epoch": 0.6955617198335645, + "grad_norm": 0.32761164833965783, + "learning_rate": 9.953472714148576e-06, + "loss": 0.5383, + "step": 1003 + }, + { + "epoch": 0.6962552011095701, + "grad_norm": 0.3464952058046187, + "learning_rate": 9.953142665606945e-06, + "loss": 0.629, + "step": 1004 + }, + { + "epoch": 0.6969486823855756, + "grad_norm": 0.3074158681596855, + "learning_rate": 9.952811456084546e-06, + "loss": 0.5031, + "step": 1005 + }, + { + "epoch": 0.6976421636615812, + "grad_norm": 0.338496489557684, + "learning_rate": 9.95247908565901e-06, + "loss": 0.6012, + "step": 1006 + }, + { + "epoch": 0.6983356449375867, + "grad_norm": 0.38832683416829106, + "learning_rate": 9.952145554408245e-06, + "loss": 0.594, + "step": 1007 + }, + { + "epoch": 0.6990291262135923, + "grad_norm": 0.3231626607418699, + "learning_rate": 9.951810862410426e-06, + "loss": 0.5732, + "step": 1008 + }, + { + "epoch": 0.6997226074895978, + "grad_norm": 0.3526993233542111, + "learning_rate": 9.951475009744006e-06, + "loss": 0.6082, + "step": 1009 + }, + { + "epoch": 0.7004160887656034, + "grad_norm": 0.3329768741425063, + "learning_rate": 9.951137996487703e-06, + "loss": 0.5693, + "step": 1010 + }, + { + "epoch": 0.7011095700416089, + "grad_norm": 0.33207539616272197, + "learning_rate": 9.95079982272051e-06, + "loss": 0.5939, + "step": 1011 + }, + { + "epoch": 0.7018030513176144, + "grad_norm": 0.3510554259780628, + "learning_rate": 9.950460488521695e-06, + "loss": 0.5877, + "step": 1012 + }, + { + "epoch": 0.70249653259362, + "grad_norm": 0.3461173179627518, + "learning_rate": 9.950119993970794e-06, + "loss": 0.5952, + "step": 1013 + }, + { + "epoch": 0.7031900138696255, + "grad_norm": 0.3487381221556316, + "learning_rate": 9.949778339147617e-06, + "loss": 0.5232, + "step": 1014 + }, + { + "epoch": 0.7038834951456311, + "grad_norm": 0.3271125852802924, + "learning_rate": 9.949435524132245e-06, + "loss": 0.5609, + "step": 1015 + }, + { + "epoch": 0.7045769764216366, + "grad_norm": 0.3665837812231856, + "learning_rate": 9.949091549005033e-06, + "loss": 0.5713, + "step": 1016 + }, + { + "epoch": 0.7052704576976422, + "grad_norm": 0.34228569768066913, + "learning_rate": 9.948746413846604e-06, + "loss": 0.6004, + "step": 1017 + }, + { + "epoch": 0.7059639389736477, + "grad_norm": 0.3356409857094149, + "learning_rate": 9.948400118737856e-06, + "loss": 0.6336, + "step": 1018 + }, + { + "epoch": 0.7066574202496533, + "grad_norm": 0.342166029096126, + "learning_rate": 9.948052663759957e-06, + "loss": 0.5944, + "step": 1019 + }, + { + "epoch": 0.7073509015256588, + "grad_norm": 0.36601108993914533, + "learning_rate": 9.947704048994351e-06, + "loss": 0.5713, + "step": 1020 + }, + { + "epoch": 0.7080443828016644, + "grad_norm": 0.33386006159619147, + "learning_rate": 9.947354274522748e-06, + "loss": 0.5633, + "step": 1021 + }, + { + "epoch": 0.7087378640776699, + "grad_norm": 0.30572169905213814, + "learning_rate": 9.947003340427134e-06, + "loss": 0.5985, + "step": 1022 + }, + { + "epoch": 0.7094313453536755, + "grad_norm": 0.31640757934649083, + "learning_rate": 9.946651246789765e-06, + "loss": 0.564, + "step": 1023 + }, + { + "epoch": 0.710124826629681, + "grad_norm": 0.3670258161330911, + "learning_rate": 9.946297993693168e-06, + "loss": 0.5787, + "step": 1024 + }, + { + "epoch": 0.7108183079056866, + "grad_norm": 0.34322370570001765, + "learning_rate": 9.945943581220144e-06, + "loss": 0.5503, + "step": 1025 + }, + { + "epoch": 0.7115117891816921, + "grad_norm": 0.340602804346554, + "learning_rate": 9.945588009453766e-06, + "loss": 0.5732, + "step": 1026 + }, + { + "epoch": 0.7122052704576977, + "grad_norm": 0.38698430691899954, + "learning_rate": 9.945231278477374e-06, + "loss": 0.52, + "step": 1027 + }, + { + "epoch": 0.7128987517337032, + "grad_norm": 0.34545318871864483, + "learning_rate": 9.94487338837459e-06, + "loss": 0.6157, + "step": 1028 + }, + { + "epoch": 0.7135922330097088, + "grad_norm": 0.32974241410967664, + "learning_rate": 9.944514339229292e-06, + "loss": 0.5297, + "step": 1029 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 0.3237198660503851, + "learning_rate": 9.944154131125643e-06, + "loss": 0.5427, + "step": 1030 + }, + { + "epoch": 0.7149791955617198, + "grad_norm": 0.32577249272541753, + "learning_rate": 9.943792764148074e-06, + "loss": 0.5882, + "step": 1031 + }, + { + "epoch": 0.7156726768377254, + "grad_norm": 0.35641603035666863, + "learning_rate": 9.943430238381286e-06, + "loss": 0.5201, + "step": 1032 + }, + { + "epoch": 0.7163661581137309, + "grad_norm": 0.3388199217233206, + "learning_rate": 9.943066553910252e-06, + "loss": 0.5842, + "step": 1033 + }, + { + "epoch": 0.7170596393897365, + "grad_norm": 0.35853035041766645, + "learning_rate": 9.942701710820217e-06, + "loss": 0.5873, + "step": 1034 + }, + { + "epoch": 0.717753120665742, + "grad_norm": 0.3812354567225615, + "learning_rate": 9.942335709196697e-06, + "loss": 0.6516, + "step": 1035 + }, + { + "epoch": 0.7184466019417476, + "grad_norm": 0.3330409448434271, + "learning_rate": 9.941968549125481e-06, + "loss": 0.5635, + "step": 1036 + }, + { + "epoch": 0.7191400832177531, + "grad_norm": 0.3448194454185852, + "learning_rate": 9.94160023069263e-06, + "loss": 0.5404, + "step": 1037 + }, + { + "epoch": 0.7198335644937587, + "grad_norm": 0.39385124207811567, + "learning_rate": 9.941230753984472e-06, + "loss": 0.6243, + "step": 1038 + }, + { + "epoch": 0.7205270457697642, + "grad_norm": 0.3487501152480796, + "learning_rate": 9.940860119087612e-06, + "loss": 0.5353, + "step": 1039 + }, + { + "epoch": 0.7212205270457698, + "grad_norm": 0.3643088946026779, + "learning_rate": 9.940488326088924e-06, + "loss": 0.6251, + "step": 1040 + }, + { + "epoch": 0.7219140083217753, + "grad_norm": 0.3432021499024788, + "learning_rate": 9.940115375075551e-06, + "loss": 0.6495, + "step": 1041 + }, + { + "epoch": 0.7226074895977809, + "grad_norm": 0.3367973243594567, + "learning_rate": 9.939741266134914e-06, + "loss": 0.6211, + "step": 1042 + }, + { + "epoch": 0.7233009708737864, + "grad_norm": 0.34437975080665395, + "learning_rate": 9.9393659993547e-06, + "loss": 0.6302, + "step": 1043 + }, + { + "epoch": 0.723994452149792, + "grad_norm": 0.39039845823464975, + "learning_rate": 9.938989574822866e-06, + "loss": 0.579, + "step": 1044 + }, + { + "epoch": 0.7246879334257975, + "grad_norm": 0.32845623476097796, + "learning_rate": 9.938611992627647e-06, + "loss": 0.527, + "step": 1045 + }, + { + "epoch": 0.7253814147018031, + "grad_norm": 0.3181699129320863, + "learning_rate": 9.938233252857544e-06, + "loss": 0.5715, + "step": 1046 + }, + { + "epoch": 0.7260748959778086, + "grad_norm": 0.41892463315658773, + "learning_rate": 9.937853355601331e-06, + "loss": 0.5289, + "step": 1047 + }, + { + "epoch": 0.7267683772538142, + "grad_norm": 0.33898184108626256, + "learning_rate": 9.937472300948053e-06, + "loss": 0.6499, + "step": 1048 + }, + { + "epoch": 0.7274618585298197, + "grad_norm": 0.36282529694296206, + "learning_rate": 9.937090088987028e-06, + "loss": 0.6138, + "step": 1049 + }, + { + "epoch": 0.7281553398058253, + "grad_norm": 0.3418792229555562, + "learning_rate": 9.936706719807839e-06, + "loss": 0.5443, + "step": 1050 + }, + { + "epoch": 0.7288488210818308, + "grad_norm": 0.3505297479017804, + "learning_rate": 9.936322193500349e-06, + "loss": 0.5182, + "step": 1051 + }, + { + "epoch": 0.7295423023578363, + "grad_norm": 0.3519624416218966, + "learning_rate": 9.935936510154689e-06, + "loss": 0.5676, + "step": 1052 + }, + { + "epoch": 0.7302357836338419, + "grad_norm": 0.3113359479294091, + "learning_rate": 9.935549669861257e-06, + "loss": 0.5858, + "step": 1053 + }, + { + "epoch": 0.7309292649098474, + "grad_norm": 0.33252266872190694, + "learning_rate": 9.935161672710728e-06, + "loss": 0.5842, + "step": 1054 + }, + { + "epoch": 0.731622746185853, + "grad_norm": 0.34930045036354523, + "learning_rate": 9.934772518794047e-06, + "loss": 0.5553, + "step": 1055 + }, + { + "epoch": 0.7323162274618585, + "grad_norm": 0.3606440647866921, + "learning_rate": 9.934382208202425e-06, + "loss": 0.5665, + "step": 1056 + }, + { + "epoch": 0.7330097087378641, + "grad_norm": 0.35467096078452276, + "learning_rate": 9.93399074102735e-06, + "loss": 0.565, + "step": 1057 + }, + { + "epoch": 0.7337031900138696, + "grad_norm": 0.3322547061127572, + "learning_rate": 9.93359811736058e-06, + "loss": 0.5752, + "step": 1058 + }, + { + "epoch": 0.7343966712898752, + "grad_norm": 0.2992989740354364, + "learning_rate": 9.93320433729414e-06, + "loss": 0.5149, + "step": 1059 + }, + { + "epoch": 0.7350901525658807, + "grad_norm": 0.33417326760115984, + "learning_rate": 9.932809400920332e-06, + "loss": 0.5534, + "step": 1060 + }, + { + "epoch": 0.7357836338418863, + "grad_norm": 0.37007578925035495, + "learning_rate": 9.932413308331725e-06, + "loss": 0.5881, + "step": 1061 + }, + { + "epoch": 0.7364771151178918, + "grad_norm": 0.34269131669156866, + "learning_rate": 9.932016059621161e-06, + "loss": 0.5604, + "step": 1062 + }, + { + "epoch": 0.7371705963938974, + "grad_norm": 0.33546161759477605, + "learning_rate": 9.931617654881753e-06, + "loss": 0.5589, + "step": 1063 + }, + { + "epoch": 0.7378640776699029, + "grad_norm": 0.3847215713142921, + "learning_rate": 9.931218094206882e-06, + "loss": 0.6045, + "step": 1064 + }, + { + "epoch": 0.7385575589459085, + "grad_norm": 0.3367235220263564, + "learning_rate": 9.930817377690205e-06, + "loss": 0.5897, + "step": 1065 + }, + { + "epoch": 0.739251040221914, + "grad_norm": 0.3406355566353876, + "learning_rate": 9.930415505425644e-06, + "loss": 0.5224, + "step": 1066 + }, + { + "epoch": 0.7399445214979196, + "grad_norm": 0.32686734158684394, + "learning_rate": 9.930012477507397e-06, + "loss": 0.6715, + "step": 1067 + }, + { + "epoch": 0.7406380027739251, + "grad_norm": 0.41130446696006584, + "learning_rate": 9.92960829402993e-06, + "loss": 0.6865, + "step": 1068 + }, + { + "epoch": 0.7413314840499307, + "grad_norm": 0.34924981598093613, + "learning_rate": 9.92920295508798e-06, + "loss": 0.5805, + "step": 1069 + }, + { + "epoch": 0.7420249653259362, + "grad_norm": 0.3312931346094807, + "learning_rate": 9.928796460776558e-06, + "loss": 0.567, + "step": 1070 + }, + { + "epoch": 0.7427184466019418, + "grad_norm": 0.37359412047424895, + "learning_rate": 9.928388811190938e-06, + "loss": 0.5479, + "step": 1071 + }, + { + "epoch": 0.7434119278779473, + "grad_norm": 0.361243689595125, + "learning_rate": 9.927980006426677e-06, + "loss": 0.6182, + "step": 1072 + }, + { + "epoch": 0.7441054091539528, + "grad_norm": 0.3694364466931621, + "learning_rate": 9.927570046579591e-06, + "loss": 0.6626, + "step": 1073 + }, + { + "epoch": 0.7447988904299584, + "grad_norm": 0.3485816922611258, + "learning_rate": 9.927158931745775e-06, + "loss": 0.6727, + "step": 1074 + }, + { + "epoch": 0.7454923717059639, + "grad_norm": 0.34157275189351793, + "learning_rate": 9.926746662021589e-06, + "loss": 0.589, + "step": 1075 + }, + { + "epoch": 0.7461858529819695, + "grad_norm": 0.38194584535428683, + "learning_rate": 9.926333237503665e-06, + "loss": 0.6034, + "step": 1076 + }, + { + "epoch": 0.746879334257975, + "grad_norm": 0.35593483853898966, + "learning_rate": 9.92591865828891e-06, + "loss": 0.644, + "step": 1077 + }, + { + "epoch": 0.7475728155339806, + "grad_norm": 0.3626096624134651, + "learning_rate": 9.925502924474495e-06, + "loss": 0.5897, + "step": 1078 + }, + { + "epoch": 0.7482662968099861, + "grad_norm": 0.3720710472905889, + "learning_rate": 9.925086036157869e-06, + "loss": 0.6038, + "step": 1079 + }, + { + "epoch": 0.7489597780859917, + "grad_norm": 0.33045752040041915, + "learning_rate": 9.924667993436742e-06, + "loss": 0.6144, + "step": 1080 + }, + { + "epoch": 0.7496532593619972, + "grad_norm": 0.3330811925678463, + "learning_rate": 9.924248796409107e-06, + "loss": 0.5171, + "step": 1081 + }, + { + "epoch": 0.7503467406380028, + "grad_norm": 0.356313244135962, + "learning_rate": 9.923828445173215e-06, + "loss": 0.6731, + "step": 1082 + }, + { + "epoch": 0.7510402219140083, + "grad_norm": 0.34526196714865764, + "learning_rate": 9.923406939827596e-06, + "loss": 0.6021, + "step": 1083 + }, + { + "epoch": 0.7517337031900139, + "grad_norm": 0.3186655600344347, + "learning_rate": 9.922984280471048e-06, + "loss": 0.5589, + "step": 1084 + }, + { + "epoch": 0.7524271844660194, + "grad_norm": 0.3365046214183426, + "learning_rate": 9.922560467202638e-06, + "loss": 0.5753, + "step": 1085 + }, + { + "epoch": 0.753120665742025, + "grad_norm": 0.3807776922687547, + "learning_rate": 9.922135500121705e-06, + "loss": 0.5785, + "step": 1086 + }, + { + "epoch": 0.7538141470180305, + "grad_norm": 0.3310493131741956, + "learning_rate": 9.921709379327859e-06, + "loss": 0.5577, + "step": 1087 + }, + { + "epoch": 0.7545076282940361, + "grad_norm": 0.4136156302344913, + "learning_rate": 9.92128210492098e-06, + "loss": 0.6314, + "step": 1088 + }, + { + "epoch": 0.7552011095700416, + "grad_norm": 0.37631472966269475, + "learning_rate": 9.920853677001215e-06, + "loss": 0.6565, + "step": 1089 + }, + { + "epoch": 0.7558945908460472, + "grad_norm": 0.6445027382242253, + "learning_rate": 9.920424095668988e-06, + "loss": 0.6184, + "step": 1090 + }, + { + "epoch": 0.7565880721220527, + "grad_norm": 0.350976316901039, + "learning_rate": 9.919993361024989e-06, + "loss": 0.5619, + "step": 1091 + }, + { + "epoch": 0.7572815533980582, + "grad_norm": 0.31328935691242954, + "learning_rate": 9.919561473170178e-06, + "loss": 0.5855, + "step": 1092 + }, + { + "epoch": 0.7579750346740638, + "grad_norm": 0.3221338932383943, + "learning_rate": 9.919128432205786e-06, + "loss": 0.5937, + "step": 1093 + }, + { + "epoch": 0.7586685159500693, + "grad_norm": 0.33196246372235766, + "learning_rate": 9.918694238233314e-06, + "loss": 0.6027, + "step": 1094 + }, + { + "epoch": 0.7593619972260749, + "grad_norm": 0.3942666403658931, + "learning_rate": 9.91825889135454e-06, + "loss": 0.6452, + "step": 1095 + }, + { + "epoch": 0.7600554785020804, + "grad_norm": 0.3675180175724699, + "learning_rate": 9.9178223916715e-06, + "loss": 0.5824, + "step": 1096 + }, + { + "epoch": 0.760748959778086, + "grad_norm": 0.3032022133924184, + "learning_rate": 9.917384739286505e-06, + "loss": 0.5887, + "step": 1097 + }, + { + "epoch": 0.7614424410540915, + "grad_norm": 0.33094285704358667, + "learning_rate": 9.916945934302142e-06, + "loss": 0.5337, + "step": 1098 + }, + { + "epoch": 0.7621359223300971, + "grad_norm": 0.3250037259073846, + "learning_rate": 9.916505976821262e-06, + "loss": 0.5563, + "step": 1099 + }, + { + "epoch": 0.7628294036061026, + "grad_norm": 0.3698602118076207, + "learning_rate": 9.91606486694699e-06, + "loss": 0.5598, + "step": 1100 + }, + { + "epoch": 0.7635228848821082, + "grad_norm": 0.35241780322083244, + "learning_rate": 9.915622604782716e-06, + "loss": 0.5633, + "step": 1101 + }, + { + "epoch": 0.7642163661581137, + "grad_norm": 0.3404720849529966, + "learning_rate": 9.915179190432102e-06, + "loss": 0.5658, + "step": 1102 + }, + { + "epoch": 0.7649098474341193, + "grad_norm": 0.34218980207078303, + "learning_rate": 9.914734623999086e-06, + "loss": 0.5564, + "step": 1103 + }, + { + "epoch": 0.7656033287101248, + "grad_norm": 0.34919546385510847, + "learning_rate": 9.914288905587867e-06, + "loss": 0.4655, + "step": 1104 + }, + { + "epoch": 0.7662968099861304, + "grad_norm": 0.36238942723055884, + "learning_rate": 9.91384203530292e-06, + "loss": 0.6234, + "step": 1105 + }, + { + "epoch": 0.7669902912621359, + "grad_norm": 0.3483550241642675, + "learning_rate": 9.913394013248987e-06, + "loss": 0.6375, + "step": 1106 + }, + { + "epoch": 0.7676837725381415, + "grad_norm": 0.31474225103999814, + "learning_rate": 9.912944839531083e-06, + "loss": 0.5282, + "step": 1107 + }, + { + "epoch": 0.768377253814147, + "grad_norm": 0.3200443820947016, + "learning_rate": 9.912494514254487e-06, + "loss": 0.6036, + "step": 1108 + }, + { + "epoch": 0.7690707350901526, + "grad_norm": 0.3229008835087235, + "learning_rate": 9.912043037524758e-06, + "loss": 0.5547, + "step": 1109 + }, + { + "epoch": 0.7697642163661581, + "grad_norm": 0.3165451146540614, + "learning_rate": 9.911590409447713e-06, + "loss": 0.5463, + "step": 1110 + }, + { + "epoch": 0.7704576976421637, + "grad_norm": 0.33748946916856115, + "learning_rate": 9.91113663012945e-06, + "loss": 0.5923, + "step": 1111 + }, + { + "epoch": 0.7711511789181692, + "grad_norm": 0.32709168605333655, + "learning_rate": 9.910681699676327e-06, + "loss": 0.5438, + "step": 1112 + }, + { + "epoch": 0.7718446601941747, + "grad_norm": 0.33964568054254574, + "learning_rate": 9.91022561819498e-06, + "loss": 0.5095, + "step": 1113 + }, + { + "epoch": 0.7725381414701803, + "grad_norm": 0.3448286501208917, + "learning_rate": 9.909768385792308e-06, + "loss": 0.5605, + "step": 1114 + }, + { + "epoch": 0.7732316227461858, + "grad_norm": 0.3286078533295107, + "learning_rate": 9.909310002575486e-06, + "loss": 0.5318, + "step": 1115 + }, + { + "epoch": 0.7739251040221914, + "grad_norm": 0.3366454563267662, + "learning_rate": 9.908850468651953e-06, + "loss": 0.58, + "step": 1116 + }, + { + "epoch": 0.7746185852981969, + "grad_norm": 0.46236589261179867, + "learning_rate": 9.908389784129424e-06, + "loss": 0.5807, + "step": 1117 + }, + { + "epoch": 0.7753120665742025, + "grad_norm": 0.3524473088468238, + "learning_rate": 9.907927949115877e-06, + "loss": 0.5745, + "step": 1118 + }, + { + "epoch": 0.776005547850208, + "grad_norm": 0.3157956125887099, + "learning_rate": 9.907464963719562e-06, + "loss": 0.5167, + "step": 1119 + }, + { + "epoch": 0.7766990291262136, + "grad_norm": 0.33920527726146843, + "learning_rate": 9.907000828049001e-06, + "loss": 0.5183, + "step": 1120 + }, + { + "epoch": 0.7773925104022191, + "grad_norm": 0.3571934309252906, + "learning_rate": 9.906535542212984e-06, + "loss": 0.5849, + "step": 1121 + }, + { + "epoch": 0.7780859916782247, + "grad_norm": 0.34341155887390523, + "learning_rate": 9.906069106320573e-06, + "loss": 0.6189, + "step": 1122 + }, + { + "epoch": 0.7787794729542302, + "grad_norm": 0.36803861258608084, + "learning_rate": 9.905601520481094e-06, + "loss": 0.5567, + "step": 1123 + }, + { + "epoch": 0.7794729542302358, + "grad_norm": 0.33411631641875295, + "learning_rate": 9.905132784804146e-06, + "loss": 0.5717, + "step": 1124 + }, + { + "epoch": 0.7801664355062413, + "grad_norm": 0.3749463095695367, + "learning_rate": 9.904662899399598e-06, + "loss": 0.5764, + "step": 1125 + }, + { + "epoch": 0.7808599167822469, + "grad_norm": 0.3411261494102941, + "learning_rate": 9.904191864377588e-06, + "loss": 0.5028, + "step": 1126 + }, + { + "epoch": 0.7815533980582524, + "grad_norm": 0.313940560560632, + "learning_rate": 9.903719679848522e-06, + "loss": 0.5555, + "step": 1127 + }, + { + "epoch": 0.782246879334258, + "grad_norm": 0.6566302794869116, + "learning_rate": 9.903246345923078e-06, + "loss": 0.5844, + "step": 1128 + }, + { + "epoch": 0.7829403606102635, + "grad_norm": 0.3347037281216503, + "learning_rate": 9.902771862712201e-06, + "loss": 0.5831, + "step": 1129 + }, + { + "epoch": 0.7836338418862691, + "grad_norm": 0.41623786574982496, + "learning_rate": 9.902296230327109e-06, + "loss": 0.6246, + "step": 1130 + }, + { + "epoch": 0.7843273231622746, + "grad_norm": 0.3297323734484108, + "learning_rate": 9.901819448879284e-06, + "loss": 0.5644, + "step": 1131 + }, + { + "epoch": 0.7850208044382802, + "grad_norm": 0.30955004662696883, + "learning_rate": 9.901341518480478e-06, + "loss": 0.5616, + "step": 1132 + }, + { + "epoch": 0.7857142857142857, + "grad_norm": 0.3235277563056244, + "learning_rate": 9.900862439242719e-06, + "loss": 0.6166, + "step": 1133 + }, + { + "epoch": 0.7864077669902912, + "grad_norm": 0.31748454405852916, + "learning_rate": 9.9003822112783e-06, + "loss": 0.5235, + "step": 1134 + }, + { + "epoch": 0.7871012482662968, + "grad_norm": 0.33023689686714086, + "learning_rate": 9.899900834699778e-06, + "loss": 0.5849, + "step": 1135 + }, + { + "epoch": 0.7877947295423023, + "grad_norm": 0.3495578780899382, + "learning_rate": 9.899418309619988e-06, + "loss": 0.6115, + "step": 1136 + }, + { + "epoch": 0.7884882108183079, + "grad_norm": 0.32497385026555636, + "learning_rate": 9.89893463615203e-06, + "loss": 0.5123, + "step": 1137 + }, + { + "epoch": 0.7891816920943134, + "grad_norm": 0.31823531157818197, + "learning_rate": 9.898449814409272e-06, + "loss": 0.5472, + "step": 1138 + }, + { + "epoch": 0.789875173370319, + "grad_norm": 0.3871723185910273, + "learning_rate": 9.897963844505355e-06, + "loss": 0.6342, + "step": 1139 + }, + { + "epoch": 0.7905686546463245, + "grad_norm": 0.32748688538451093, + "learning_rate": 9.897476726554185e-06, + "loss": 0.6139, + "step": 1140 + }, + { + "epoch": 0.7912621359223301, + "grad_norm": 0.35033793604159597, + "learning_rate": 9.89698846066994e-06, + "loss": 0.6325, + "step": 1141 + }, + { + "epoch": 0.7919556171983356, + "grad_norm": 0.3487389698279109, + "learning_rate": 9.896499046967065e-06, + "loss": 0.6317, + "step": 1142 + }, + { + "epoch": 0.7926490984743412, + "grad_norm": 0.3531649178211908, + "learning_rate": 9.896008485560275e-06, + "loss": 0.5557, + "step": 1143 + }, + { + "epoch": 0.7933425797503467, + "grad_norm": 1.0305191170454497, + "learning_rate": 9.895516776564555e-06, + "loss": 0.5577, + "step": 1144 + }, + { + "epoch": 0.7940360610263523, + "grad_norm": 0.32998484447400867, + "learning_rate": 9.895023920095157e-06, + "loss": 0.5486, + "step": 1145 + }, + { + "epoch": 0.7947295423023578, + "grad_norm": 0.3619027933913946, + "learning_rate": 9.894529916267605e-06, + "loss": 0.5674, + "step": 1146 + }, + { + "epoch": 0.7954230235783634, + "grad_norm": 0.3511914223909972, + "learning_rate": 9.894034765197688e-06, + "loss": 0.6188, + "step": 1147 + }, + { + "epoch": 0.7961165048543689, + "grad_norm": 0.3526373782570898, + "learning_rate": 9.893538467001466e-06, + "loss": 0.6216, + "step": 1148 + }, + { + "epoch": 0.7968099861303745, + "grad_norm": 0.3573519780274059, + "learning_rate": 9.893041021795266e-06, + "loss": 0.5697, + "step": 1149 + }, + { + "epoch": 0.79750346740638, + "grad_norm": 0.3369604374861474, + "learning_rate": 9.892542429695691e-06, + "loss": 0.5902, + "step": 1150 + }, + { + "epoch": 0.7981969486823856, + "grad_norm": 0.3706997741580729, + "learning_rate": 9.892042690819602e-06, + "loss": 0.5412, + "step": 1151 + }, + { + "epoch": 0.7988904299583911, + "grad_norm": 0.305468979726182, + "learning_rate": 9.891541805284137e-06, + "loss": 0.4956, + "step": 1152 + }, + { + "epoch": 0.7995839112343966, + "grad_norm": 0.32809836169628454, + "learning_rate": 9.891039773206698e-06, + "loss": 0.5511, + "step": 1153 + }, + { + "epoch": 0.8002773925104022, + "grad_norm": 0.3405183403533073, + "learning_rate": 9.890536594704961e-06, + "loss": 0.536, + "step": 1154 + }, + { + "epoch": 0.8009708737864077, + "grad_norm": 0.32633246107874186, + "learning_rate": 9.890032269896862e-06, + "loss": 0.5373, + "step": 1155 + }, + { + "epoch": 0.8016643550624133, + "grad_norm": 0.34040614751990844, + "learning_rate": 9.889526798900615e-06, + "loss": 0.5423, + "step": 1156 + }, + { + "epoch": 0.8023578363384188, + "grad_norm": 0.33871526192231677, + "learning_rate": 9.889020181834698e-06, + "loss": 0.6379, + "step": 1157 + }, + { + "epoch": 0.8030513176144244, + "grad_norm": 0.34929270754374225, + "learning_rate": 9.888512418817861e-06, + "loss": 0.5281, + "step": 1158 + }, + { + "epoch": 0.8037447988904299, + "grad_norm": 0.3740202942568168, + "learning_rate": 9.888003509969116e-06, + "loss": 0.5771, + "step": 1159 + }, + { + "epoch": 0.8044382801664355, + "grad_norm": 0.4620528126667381, + "learning_rate": 9.887493455407746e-06, + "loss": 0.587, + "step": 1160 + }, + { + "epoch": 0.805131761442441, + "grad_norm": 0.2986631872001357, + "learning_rate": 9.88698225525331e-06, + "loss": 0.5042, + "step": 1161 + }, + { + "epoch": 0.8058252427184466, + "grad_norm": 0.32733982878285756, + "learning_rate": 9.886469909625624e-06, + "loss": 0.5278, + "step": 1162 + }, + { + "epoch": 0.8065187239944521, + "grad_norm": 0.3208486870317828, + "learning_rate": 9.885956418644783e-06, + "loss": 0.5399, + "step": 1163 + }, + { + "epoch": 0.8072122052704577, + "grad_norm": 0.35713225551264444, + "learning_rate": 9.885441782431143e-06, + "loss": 0.6282, + "step": 1164 + }, + { + "epoch": 0.8079056865464632, + "grad_norm": 0.3534688004612578, + "learning_rate": 9.884926001105331e-06, + "loss": 0.61, + "step": 1165 + }, + { + "epoch": 0.8085991678224688, + "grad_norm": 0.3573246841498351, + "learning_rate": 9.884409074788242e-06, + "loss": 0.5765, + "step": 1166 + }, + { + "epoch": 0.8092926490984743, + "grad_norm": 0.3542712347958411, + "learning_rate": 9.883891003601041e-06, + "loss": 0.5989, + "step": 1167 + }, + { + "epoch": 0.8099861303744799, + "grad_norm": 0.3472953991002415, + "learning_rate": 9.883371787665158e-06, + "loss": 0.6198, + "step": 1168 + }, + { + "epoch": 0.8106796116504854, + "grad_norm": 0.32420034756459143, + "learning_rate": 9.882851427102299e-06, + "loss": 0.5682, + "step": 1169 + }, + { + "epoch": 0.811373092926491, + "grad_norm": 0.3307306065177549, + "learning_rate": 9.882329922034424e-06, + "loss": 0.5373, + "step": 1170 + }, + { + "epoch": 0.8120665742024965, + "grad_norm": 0.3118736702940442, + "learning_rate": 9.881807272583776e-06, + "loss": 0.5125, + "step": 1171 + }, + { + "epoch": 0.812760055478502, + "grad_norm": 0.3459557102540757, + "learning_rate": 9.88128347887286e-06, + "loss": 0.644, + "step": 1172 + }, + { + "epoch": 0.8134535367545076, + "grad_norm": 0.34330909907101614, + "learning_rate": 9.880758541024449e-06, + "loss": 0.5741, + "step": 1173 + }, + { + "epoch": 0.8141470180305131, + "grad_norm": 0.3261296168661891, + "learning_rate": 9.880232459161583e-06, + "loss": 0.5514, + "step": 1174 + }, + { + "epoch": 0.8148404993065187, + "grad_norm": 0.32032462633009595, + "learning_rate": 9.879705233407576e-06, + "loss": 0.5293, + "step": 1175 + }, + { + "epoch": 0.8155339805825242, + "grad_norm": 0.3779710108286491, + "learning_rate": 9.879176863885997e-06, + "loss": 0.5532, + "step": 1176 + }, + { + "epoch": 0.8162274618585298, + "grad_norm": 0.3467233546272369, + "learning_rate": 9.878647350720703e-06, + "loss": 0.5986, + "step": 1177 + }, + { + "epoch": 0.8169209431345353, + "grad_norm": 0.3561514472980493, + "learning_rate": 9.8781166940358e-06, + "loss": 0.6511, + "step": 1178 + }, + { + "epoch": 0.8176144244105409, + "grad_norm": 0.36134302910341937, + "learning_rate": 9.877584893955674e-06, + "loss": 0.5211, + "step": 1179 + }, + { + "epoch": 0.8183079056865464, + "grad_norm": 0.3158580471200914, + "learning_rate": 9.877051950604972e-06, + "loss": 0.5711, + "step": 1180 + }, + { + "epoch": 0.819001386962552, + "grad_norm": 0.3613864439699286, + "learning_rate": 9.876517864108617e-06, + "loss": 0.5703, + "step": 1181 + }, + { + "epoch": 0.8196948682385575, + "grad_norm": 0.3075357148573775, + "learning_rate": 9.87598263459179e-06, + "loss": 0.51, + "step": 1182 + }, + { + "epoch": 0.8203883495145631, + "grad_norm": 0.3188142632127063, + "learning_rate": 9.875446262179948e-06, + "loss": 0.5379, + "step": 1183 + }, + { + "epoch": 0.8210818307905686, + "grad_norm": 0.3767995481374369, + "learning_rate": 9.874908746998811e-06, + "loss": 0.5949, + "step": 1184 + }, + { + "epoch": 0.8217753120665742, + "grad_norm": 0.32095775879686966, + "learning_rate": 9.87437008917437e-06, + "loss": 0.5404, + "step": 1185 + }, + { + "epoch": 0.8224687933425797, + "grad_norm": 0.3576115548174324, + "learning_rate": 9.873830288832882e-06, + "loss": 0.5634, + "step": 1186 + }, + { + "epoch": 0.8231622746185853, + "grad_norm": 0.3422084150460348, + "learning_rate": 9.873289346100872e-06, + "loss": 0.5852, + "step": 1187 + }, + { + "epoch": 0.8238557558945908, + "grad_norm": 0.3222649805066783, + "learning_rate": 9.872747261105133e-06, + "loss": 0.5463, + "step": 1188 + }, + { + "epoch": 0.8245492371705964, + "grad_norm": 0.3496908260537203, + "learning_rate": 9.872204033972727e-06, + "loss": 0.5205, + "step": 1189 + }, + { + "epoch": 0.8252427184466019, + "grad_norm": 0.3388360737763813, + "learning_rate": 9.87165966483098e-06, + "loss": 0.5909, + "step": 1190 + }, + { + "epoch": 0.8259361997226075, + "grad_norm": 0.3236104661748136, + "learning_rate": 9.871114153807491e-06, + "loss": 0.5578, + "step": 1191 + }, + { + "epoch": 0.826629680998613, + "grad_norm": 0.39015592746291056, + "learning_rate": 9.870567501030122e-06, + "loss": 0.6206, + "step": 1192 + }, + { + "epoch": 0.8273231622746186, + "grad_norm": 0.33132849595283714, + "learning_rate": 9.870019706627006e-06, + "loss": 0.6117, + "step": 1193 + }, + { + "epoch": 0.8280166435506241, + "grad_norm": 0.35448745194396175, + "learning_rate": 9.869470770726541e-06, + "loss": 0.5997, + "step": 1194 + }, + { + "epoch": 0.8287101248266296, + "grad_norm": 0.3208479735558261, + "learning_rate": 9.868920693457393e-06, + "loss": 0.5917, + "step": 1195 + }, + { + "epoch": 0.8294036061026352, + "grad_norm": 0.3324491514388112, + "learning_rate": 9.868369474948498e-06, + "loss": 0.5659, + "step": 1196 + }, + { + "epoch": 0.8300970873786407, + "grad_norm": 0.3667633479287624, + "learning_rate": 9.867817115329055e-06, + "loss": 0.605, + "step": 1197 + }, + { + "epoch": 0.8307905686546463, + "grad_norm": 0.319951425929113, + "learning_rate": 9.867263614728535e-06, + "loss": 0.5668, + "step": 1198 + }, + { + "epoch": 0.8314840499306518, + "grad_norm": 0.3219623411274206, + "learning_rate": 9.866708973276674e-06, + "loss": 0.5381, + "step": 1199 + }, + { + "epoch": 0.8321775312066574, + "grad_norm": 0.33814181704870083, + "learning_rate": 9.866153191103476e-06, + "loss": 0.5031, + "step": 1200 + }, + { + "epoch": 0.8328710124826629, + "grad_norm": 0.3574079179689098, + "learning_rate": 9.865596268339213e-06, + "loss": 0.5947, + "step": 1201 + }, + { + "epoch": 0.8335644937586685, + "grad_norm": 0.344780139859224, + "learning_rate": 9.865038205114422e-06, + "loss": 0.6166, + "step": 1202 + }, + { + "epoch": 0.834257975034674, + "grad_norm": 0.32138095499949426, + "learning_rate": 9.86447900155991e-06, + "loss": 0.5854, + "step": 1203 + }, + { + "epoch": 0.8349514563106796, + "grad_norm": 0.32251090179520675, + "learning_rate": 9.863918657806752e-06, + "loss": 0.5606, + "step": 1204 + }, + { + "epoch": 0.8356449375866851, + "grad_norm": 0.31744573456007175, + "learning_rate": 9.863357173986285e-06, + "loss": 0.5706, + "step": 1205 + }, + { + "epoch": 0.8363384188626907, + "grad_norm": 0.3258706406679843, + "learning_rate": 9.862794550230119e-06, + "loss": 0.5624, + "step": 1206 + }, + { + "epoch": 0.8370319001386962, + "grad_norm": 0.37917664859412087, + "learning_rate": 9.862230786670129e-06, + "loss": 0.5854, + "step": 1207 + }, + { + "epoch": 0.8377253814147018, + "grad_norm": 0.3712638165551654, + "learning_rate": 9.861665883438456e-06, + "loss": 0.5972, + "step": 1208 + }, + { + "epoch": 0.8384188626907073, + "grad_norm": 0.45810505441807553, + "learning_rate": 9.86109984066751e-06, + "loss": 0.6128, + "step": 1209 + }, + { + "epoch": 0.8391123439667129, + "grad_norm": 0.6036613407606876, + "learning_rate": 9.860532658489967e-06, + "loss": 0.5489, + "step": 1210 + }, + { + "epoch": 0.8398058252427184, + "grad_norm": 0.33020040403158707, + "learning_rate": 9.85996433703877e-06, + "loss": 0.5825, + "step": 1211 + }, + { + "epoch": 0.840499306518724, + "grad_norm": 0.3621560256794954, + "learning_rate": 9.85939487644713e-06, + "loss": 0.5318, + "step": 1212 + }, + { + "epoch": 0.8411927877947295, + "grad_norm": 0.34977264380135165, + "learning_rate": 9.858824276848524e-06, + "loss": 0.6136, + "step": 1213 + }, + { + "epoch": 0.841886269070735, + "grad_norm": 0.32135164908334707, + "learning_rate": 9.858252538376698e-06, + "loss": 0.5025, + "step": 1214 + }, + { + "epoch": 0.8425797503467406, + "grad_norm": 0.3898124430815325, + "learning_rate": 9.857679661165663e-06, + "loss": 0.5533, + "step": 1215 + }, + { + "epoch": 0.8432732316227461, + "grad_norm": 0.3600086857034731, + "learning_rate": 9.857105645349694e-06, + "loss": 0.579, + "step": 1216 + }, + { + "epoch": 0.8439667128987517, + "grad_norm": 0.3901900266989973, + "learning_rate": 9.856530491063338e-06, + "loss": 0.5646, + "step": 1217 + }, + { + "epoch": 0.8446601941747572, + "grad_norm": 0.3665341133398684, + "learning_rate": 9.855954198441411e-06, + "loss": 0.6373, + "step": 1218 + }, + { + "epoch": 0.8453536754507628, + "grad_norm": 0.33695444682139386, + "learning_rate": 9.855376767618985e-06, + "loss": 0.5542, + "step": 1219 + }, + { + "epoch": 0.8460471567267683, + "grad_norm": 0.33689360622798203, + "learning_rate": 9.854798198731411e-06, + "loss": 0.6371, + "step": 1220 + }, + { + "epoch": 0.8467406380027739, + "grad_norm": 0.3261138900491812, + "learning_rate": 9.854218491914298e-06, + "loss": 0.477, + "step": 1221 + }, + { + "epoch": 0.8474341192787794, + "grad_norm": 0.30686493763733774, + "learning_rate": 9.853637647303528e-06, + "loss": 0.4961, + "step": 1222 + }, + { + "epoch": 0.848127600554785, + "grad_norm": 0.326983557469853, + "learning_rate": 9.853055665035244e-06, + "loss": 0.5263, + "step": 1223 + }, + { + "epoch": 0.8488210818307905, + "grad_norm": 0.3839857778727524, + "learning_rate": 9.85247254524586e-06, + "loss": 0.5942, + "step": 1224 + }, + { + "epoch": 0.8495145631067961, + "grad_norm": 0.36238512304436643, + "learning_rate": 9.851888288072053e-06, + "loss": 0.5472, + "step": 1225 + }, + { + "epoch": 0.8502080443828016, + "grad_norm": 0.3328601800683192, + "learning_rate": 9.851302893650773e-06, + "loss": 0.5815, + "step": 1226 + }, + { + "epoch": 0.8509015256588072, + "grad_norm": 0.31869695935851317, + "learning_rate": 9.850716362119229e-06, + "loss": 0.5511, + "step": 1227 + }, + { + "epoch": 0.8515950069348127, + "grad_norm": 0.3218992384439522, + "learning_rate": 9.850128693614898e-06, + "loss": 0.5431, + "step": 1228 + }, + { + "epoch": 0.8522884882108183, + "grad_norm": 0.3075557984826137, + "learning_rate": 9.84953988827553e-06, + "loss": 0.5532, + "step": 1229 + }, + { + "epoch": 0.8529819694868238, + "grad_norm": 0.32272884303113597, + "learning_rate": 9.848949946239132e-06, + "loss": 0.6061, + "step": 1230 + }, + { + "epoch": 0.8536754507628294, + "grad_norm": 0.3288623926118559, + "learning_rate": 9.848358867643985e-06, + "loss": 0.4792, + "step": 1231 + }, + { + "epoch": 0.8543689320388349, + "grad_norm": 0.3610242978031061, + "learning_rate": 9.847766652628635e-06, + "loss": 0.6134, + "step": 1232 + }, + { + "epoch": 0.8550624133148405, + "grad_norm": 0.3141108964836117, + "learning_rate": 9.847173301331889e-06, + "loss": 0.543, + "step": 1233 + }, + { + "epoch": 0.855755894590846, + "grad_norm": 0.3712592727169921, + "learning_rate": 9.846578813892827e-06, + "loss": 0.5927, + "step": 1234 + }, + { + "epoch": 0.8564493758668515, + "grad_norm": 0.3422545784436403, + "learning_rate": 9.84598319045079e-06, + "loss": 0.5545, + "step": 1235 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 0.34192867170626523, + "learning_rate": 9.84538643114539e-06, + "loss": 0.614, + "step": 1236 + }, + { + "epoch": 0.8578363384188626, + "grad_norm": 0.34078198263330395, + "learning_rate": 9.844788536116504e-06, + "loss": 0.5408, + "step": 1237 + }, + { + "epoch": 0.8585298196948682, + "grad_norm": 0.3189641682226928, + "learning_rate": 9.844189505504272e-06, + "loss": 0.5322, + "step": 1238 + }, + { + "epoch": 0.8592233009708737, + "grad_norm": 0.3186152647314188, + "learning_rate": 9.843589339449102e-06, + "loss": 0.5346, + "step": 1239 + }, + { + "epoch": 0.8599167822468793, + "grad_norm": 0.3267157115057452, + "learning_rate": 9.84298803809167e-06, + "loss": 0.5719, + "step": 1240 + }, + { + "epoch": 0.8606102635228848, + "grad_norm": 0.3245630397273561, + "learning_rate": 9.842385601572918e-06, + "loss": 0.5165, + "step": 1241 + }, + { + "epoch": 0.8613037447988904, + "grad_norm": 0.3910276769829623, + "learning_rate": 9.841782030034049e-06, + "loss": 0.6059, + "step": 1242 + }, + { + "epoch": 0.8619972260748959, + "grad_norm": 0.4074256777033721, + "learning_rate": 9.841177323616539e-06, + "loss": 0.5965, + "step": 1243 + }, + { + "epoch": 0.8626907073509015, + "grad_norm": 0.34814973925935633, + "learning_rate": 9.840571482462126e-06, + "loss": 0.5365, + "step": 1244 + }, + { + "epoch": 0.863384188626907, + "grad_norm": 0.34036803058568826, + "learning_rate": 9.839964506712814e-06, + "loss": 0.561, + "step": 1245 + }, + { + "epoch": 0.8640776699029126, + "grad_norm": 0.341389026646717, + "learning_rate": 9.839356396510875e-06, + "loss": 0.5451, + "step": 1246 + }, + { + "epoch": 0.8647711511789181, + "grad_norm": 0.2907921324032125, + "learning_rate": 9.838747151998844e-06, + "loss": 0.4932, + "step": 1247 + }, + { + "epoch": 0.8654646324549237, + "grad_norm": 0.3127236294467869, + "learning_rate": 9.838136773319527e-06, + "loss": 0.5163, + "step": 1248 + }, + { + "epoch": 0.8661581137309292, + "grad_norm": 0.34454414991532545, + "learning_rate": 9.837525260615987e-06, + "loss": 0.5189, + "step": 1249 + }, + { + "epoch": 0.8668515950069348, + "grad_norm": 0.36058454065948325, + "learning_rate": 9.836912614031561e-06, + "loss": 0.6312, + "step": 1250 + }, + { + "epoch": 0.8675450762829403, + "grad_norm": 0.34673590336860904, + "learning_rate": 9.83629883370985e-06, + "loss": 0.5893, + "step": 1251 + }, + { + "epoch": 0.8682385575589459, + "grad_norm": 0.33511794703223685, + "learning_rate": 9.835683919794719e-06, + "loss": 0.5658, + "step": 1252 + }, + { + "epoch": 0.8689320388349514, + "grad_norm": 0.3210331871920328, + "learning_rate": 9.835067872430297e-06, + "loss": 0.6333, + "step": 1253 + }, + { + "epoch": 0.869625520110957, + "grad_norm": 0.33470113519608913, + "learning_rate": 9.834450691760983e-06, + "loss": 0.5474, + "step": 1254 + }, + { + "epoch": 0.8703190013869625, + "grad_norm": 0.32831830495224923, + "learning_rate": 9.833832377931442e-06, + "loss": 0.5015, + "step": 1255 + }, + { + "epoch": 0.871012482662968, + "grad_norm": 0.3633007768286215, + "learning_rate": 9.833212931086597e-06, + "loss": 0.5702, + "step": 1256 + }, + { + "epoch": 0.8717059639389736, + "grad_norm": 0.3357210640028075, + "learning_rate": 9.832592351371646e-06, + "loss": 0.5413, + "step": 1257 + }, + { + "epoch": 0.8723994452149791, + "grad_norm": 0.35920255546669794, + "learning_rate": 9.831970638932048e-06, + "loss": 0.6591, + "step": 1258 + }, + { + "epoch": 0.8730929264909847, + "grad_norm": 0.35116338869790564, + "learning_rate": 9.831347793913526e-06, + "loss": 0.603, + "step": 1259 + }, + { + "epoch": 0.8737864077669902, + "grad_norm": 0.332515881910525, + "learning_rate": 9.830723816462071e-06, + "loss": 0.5354, + "step": 1260 + }, + { + "epoch": 0.8744798890429958, + "grad_norm": 0.354597740102601, + "learning_rate": 9.83009870672394e-06, + "loss": 0.5938, + "step": 1261 + }, + { + "epoch": 0.8751733703190014, + "grad_norm": 0.3339340280415721, + "learning_rate": 9.829472464845654e-06, + "loss": 0.5307, + "step": 1262 + }, + { + "epoch": 0.875866851595007, + "grad_norm": 0.3531606867195477, + "learning_rate": 9.828845090973998e-06, + "loss": 0.5673, + "step": 1263 + }, + { + "epoch": 0.8765603328710125, + "grad_norm": 0.31848797118696937, + "learning_rate": 9.828216585256025e-06, + "loss": 0.5333, + "step": 1264 + }, + { + "epoch": 0.8772538141470181, + "grad_norm": 0.329104246002548, + "learning_rate": 9.827586947839052e-06, + "loss": 0.5743, + "step": 1265 + }, + { + "epoch": 0.8779472954230236, + "grad_norm": 0.3287601918052048, + "learning_rate": 9.826956178870662e-06, + "loss": 0.5709, + "step": 1266 + }, + { + "epoch": 0.8786407766990292, + "grad_norm": 0.3411401138107286, + "learning_rate": 9.8263242784987e-06, + "loss": 0.5621, + "step": 1267 + }, + { + "epoch": 0.8793342579750347, + "grad_norm": 0.3500515446990955, + "learning_rate": 9.825691246871283e-06, + "loss": 0.5652, + "step": 1268 + }, + { + "epoch": 0.8800277392510403, + "grad_norm": 0.3501616261287641, + "learning_rate": 9.825057084136786e-06, + "loss": 0.5716, + "step": 1269 + }, + { + "epoch": 0.8807212205270458, + "grad_norm": 0.3249135004019094, + "learning_rate": 9.824421790443855e-06, + "loss": 0.6303, + "step": 1270 + }, + { + "epoch": 0.8814147018030514, + "grad_norm": 0.3402407337272337, + "learning_rate": 9.823785365941394e-06, + "loss": 0.5482, + "step": 1271 + }, + { + "epoch": 0.8821081830790569, + "grad_norm": 0.34989503534717264, + "learning_rate": 9.82314781077858e-06, + "loss": 0.5265, + "step": 1272 + }, + { + "epoch": 0.8828016643550625, + "grad_norm": 0.33234210738171654, + "learning_rate": 9.82250912510485e-06, + "loss": 0.5606, + "step": 1273 + }, + { + "epoch": 0.883495145631068, + "grad_norm": 0.3127730100828181, + "learning_rate": 9.821869309069907e-06, + "loss": 0.5963, + "step": 1274 + }, + { + "epoch": 0.8841886269070736, + "grad_norm": 0.3484755831995452, + "learning_rate": 9.821228362823719e-06, + "loss": 0.5432, + "step": 1275 + }, + { + "epoch": 0.8848821081830791, + "grad_norm": 0.36161292333184586, + "learning_rate": 9.82058628651652e-06, + "loss": 0.6208, + "step": 1276 + }, + { + "epoch": 0.8855755894590847, + "grad_norm": 0.3181656179118097, + "learning_rate": 9.819943080298808e-06, + "loss": 0.5685, + "step": 1277 + }, + { + "epoch": 0.8862690707350902, + "grad_norm": 0.34746536626784236, + "learning_rate": 9.819298744321346e-06, + "loss": 0.5881, + "step": 1278 + }, + { + "epoch": 0.8869625520110958, + "grad_norm": 0.3541273059375999, + "learning_rate": 9.818653278735163e-06, + "loss": 0.5886, + "step": 1279 + }, + { + "epoch": 0.8876560332871013, + "grad_norm": 0.34807479241564754, + "learning_rate": 9.818006683691547e-06, + "loss": 0.5312, + "step": 1280 + }, + { + "epoch": 0.8883495145631068, + "grad_norm": 0.36630083180958223, + "learning_rate": 9.817358959342057e-06, + "loss": 0.5635, + "step": 1281 + }, + { + "epoch": 0.8890429958391124, + "grad_norm": 0.34322476466741053, + "learning_rate": 9.81671010583852e-06, + "loss": 0.5713, + "step": 1282 + }, + { + "epoch": 0.8897364771151179, + "grad_norm": 0.31767451134940866, + "learning_rate": 9.816060123333016e-06, + "loss": 0.4881, + "step": 1283 + }, + { + "epoch": 0.8904299583911235, + "grad_norm": 0.36504358027982725, + "learning_rate": 9.815409011977899e-06, + "loss": 0.5475, + "step": 1284 + }, + { + "epoch": 0.891123439667129, + "grad_norm": 0.3294562400040061, + "learning_rate": 9.814756771925785e-06, + "loss": 0.5629, + "step": 1285 + }, + { + "epoch": 0.8918169209431346, + "grad_norm": 0.3162759299057603, + "learning_rate": 9.814103403329552e-06, + "loss": 0.5342, + "step": 1286 + }, + { + "epoch": 0.8925104022191401, + "grad_norm": 0.3323054397959687, + "learning_rate": 9.813448906342348e-06, + "loss": 0.5783, + "step": 1287 + }, + { + "epoch": 0.8932038834951457, + "grad_norm": 0.31126861920575605, + "learning_rate": 9.81279328111758e-06, + "loss": 0.5362, + "step": 1288 + }, + { + "epoch": 0.8938973647711512, + "grad_norm": 0.36914508610428837, + "learning_rate": 9.812136527808924e-06, + "loss": 0.564, + "step": 1289 + }, + { + "epoch": 0.8945908460471568, + "grad_norm": 0.3538761761079088, + "learning_rate": 9.811478646570316e-06, + "loss": 0.5656, + "step": 1290 + }, + { + "epoch": 0.8952843273231623, + "grad_norm": 0.3424567488034912, + "learning_rate": 9.810819637555961e-06, + "loss": 0.5709, + "step": 1291 + }, + { + "epoch": 0.8959778085991679, + "grad_norm": 0.3191712852224483, + "learning_rate": 9.810159500920324e-06, + "loss": 0.5277, + "step": 1292 + }, + { + "epoch": 0.8966712898751734, + "grad_norm": 0.36731342267432715, + "learning_rate": 9.809498236818136e-06, + "loss": 0.5178, + "step": 1293 + }, + { + "epoch": 0.897364771151179, + "grad_norm": 0.3377594492144452, + "learning_rate": 9.808835845404393e-06, + "loss": 0.5976, + "step": 1294 + }, + { + "epoch": 0.8980582524271845, + "grad_norm": 0.3170764097891206, + "learning_rate": 9.808172326834356e-06, + "loss": 0.5529, + "step": 1295 + }, + { + "epoch": 0.8987517337031901, + "grad_norm": 0.34066135968844374, + "learning_rate": 9.807507681263549e-06, + "loss": 0.5612, + "step": 1296 + }, + { + "epoch": 0.8994452149791956, + "grad_norm": 0.3325589108742296, + "learning_rate": 9.806841908847758e-06, + "loss": 0.5772, + "step": 1297 + }, + { + "epoch": 0.9001386962552012, + "grad_norm": 0.31731030396206666, + "learning_rate": 9.806175009743035e-06, + "loss": 0.5435, + "step": 1298 + }, + { + "epoch": 0.9008321775312067, + "grad_norm": 0.31327367398706046, + "learning_rate": 9.8055069841057e-06, + "loss": 0.5438, + "step": 1299 + }, + { + "epoch": 0.9015256588072122, + "grad_norm": 0.31364140750488745, + "learning_rate": 9.80483783209233e-06, + "loss": 0.5796, + "step": 1300 + }, + { + "epoch": 0.9022191400832178, + "grad_norm": 0.31838352680737114, + "learning_rate": 9.80416755385977e-06, + "loss": 0.5677, + "step": 1301 + }, + { + "epoch": 0.9029126213592233, + "grad_norm": 0.33174094161455114, + "learning_rate": 9.80349614956513e-06, + "loss": 0.5774, + "step": 1302 + }, + { + "epoch": 0.9036061026352289, + "grad_norm": 0.3406181144519224, + "learning_rate": 9.80282361936578e-06, + "loss": 0.5012, + "step": 1303 + }, + { + "epoch": 0.9042995839112344, + "grad_norm": 0.3457856925916943, + "learning_rate": 9.802149963419356e-06, + "loss": 0.5474, + "step": 1304 + }, + { + "epoch": 0.90499306518724, + "grad_norm": 0.3712531282790912, + "learning_rate": 9.801475181883763e-06, + "loss": 0.6067, + "step": 1305 + }, + { + "epoch": 0.9056865464632455, + "grad_norm": 0.33754065670955147, + "learning_rate": 9.800799274917159e-06, + "loss": 0.5766, + "step": 1306 + }, + { + "epoch": 0.9063800277392511, + "grad_norm": 0.35865679017804203, + "learning_rate": 9.800122242677975e-06, + "loss": 0.5808, + "step": 1307 + }, + { + "epoch": 0.9070735090152566, + "grad_norm": 0.30684712161365424, + "learning_rate": 9.7994440853249e-06, + "loss": 0.5465, + "step": 1308 + }, + { + "epoch": 0.9077669902912622, + "grad_norm": 0.30032159591038865, + "learning_rate": 9.798764803016892e-06, + "loss": 0.495, + "step": 1309 + }, + { + "epoch": 0.9084604715672677, + "grad_norm": 0.3720484777884542, + "learning_rate": 9.798084395913167e-06, + "loss": 0.5922, + "step": 1310 + }, + { + "epoch": 0.9091539528432733, + "grad_norm": 0.3867791943651646, + "learning_rate": 9.79740286417321e-06, + "loss": 0.5849, + "step": 1311 + }, + { + "epoch": 0.9098474341192788, + "grad_norm": 0.32768493521246145, + "learning_rate": 9.796720207956765e-06, + "loss": 0.5702, + "step": 1312 + }, + { + "epoch": 0.9105409153952844, + "grad_norm": 0.34942558541562957, + "learning_rate": 9.796036427423844e-06, + "loss": 0.5856, + "step": 1313 + }, + { + "epoch": 0.9112343966712899, + "grad_norm": 0.3744950031666534, + "learning_rate": 9.795351522734718e-06, + "loss": 0.5531, + "step": 1314 + }, + { + "epoch": 0.9119278779472955, + "grad_norm": 0.3282846607161552, + "learning_rate": 9.794665494049926e-06, + "loss": 0.5593, + "step": 1315 + }, + { + "epoch": 0.912621359223301, + "grad_norm": 0.5530462075254108, + "learning_rate": 9.793978341530265e-06, + "loss": 0.5778, + "step": 1316 + }, + { + "epoch": 0.9133148404993066, + "grad_norm": 0.3510633295830234, + "learning_rate": 9.793290065336802e-06, + "loss": 0.6148, + "step": 1317 + }, + { + "epoch": 0.9140083217753121, + "grad_norm": 0.3416426392043918, + "learning_rate": 9.792600665630862e-06, + "loss": 0.5365, + "step": 1318 + }, + { + "epoch": 0.9147018030513177, + "grad_norm": 0.3214166710578825, + "learning_rate": 9.791910142574035e-06, + "loss": 0.6077, + "step": 1319 + }, + { + "epoch": 0.9153952843273232, + "grad_norm": 0.3058861338392189, + "learning_rate": 9.791218496328176e-06, + "loss": 0.5727, + "step": 1320 + }, + { + "epoch": 0.9160887656033287, + "grad_norm": 0.36429414031068824, + "learning_rate": 9.7905257270554e-06, + "loss": 0.5632, + "step": 1321 + }, + { + "epoch": 0.9167822468793343, + "grad_norm": 0.32208427101954895, + "learning_rate": 9.789831834918088e-06, + "loss": 0.5111, + "step": 1322 + }, + { + "epoch": 0.9174757281553398, + "grad_norm": 0.3398682101476737, + "learning_rate": 9.789136820078884e-06, + "loss": 0.5645, + "step": 1323 + }, + { + "epoch": 0.9181692094313454, + "grad_norm": 0.3609865377687872, + "learning_rate": 9.788440682700695e-06, + "loss": 0.5868, + "step": 1324 + }, + { + "epoch": 0.9188626907073509, + "grad_norm": 0.357367726817716, + "learning_rate": 9.787743422946689e-06, + "loss": 0.5962, + "step": 1325 + }, + { + "epoch": 0.9195561719833565, + "grad_norm": 0.3166361463961456, + "learning_rate": 9.787045040980299e-06, + "loss": 0.5694, + "step": 1326 + }, + { + "epoch": 0.920249653259362, + "grad_norm": 0.3301450043009037, + "learning_rate": 9.78634553696522e-06, + "loss": 0.5504, + "step": 1327 + }, + { + "epoch": 0.9209431345353676, + "grad_norm": 0.33184365807605193, + "learning_rate": 9.785644911065411e-06, + "loss": 0.5586, + "step": 1328 + }, + { + "epoch": 0.9216366158113731, + "grad_norm": 0.3688433994965824, + "learning_rate": 9.784943163445095e-06, + "loss": 0.4798, + "step": 1329 + }, + { + "epoch": 0.9223300970873787, + "grad_norm": 0.3618513962661808, + "learning_rate": 9.784240294268756e-06, + "loss": 0.5455, + "step": 1330 + }, + { + "epoch": 0.9230235783633842, + "grad_norm": 0.3457450644886999, + "learning_rate": 9.783536303701141e-06, + "loss": 0.5088, + "step": 1331 + }, + { + "epoch": 0.9237170596393898, + "grad_norm": 0.3486225872824421, + "learning_rate": 9.782831191907261e-06, + "loss": 0.5742, + "step": 1332 + }, + { + "epoch": 0.9244105409153953, + "grad_norm": 0.3315065718942818, + "learning_rate": 9.782124959052388e-06, + "loss": 0.5872, + "step": 1333 + }, + { + "epoch": 0.9251040221914009, + "grad_norm": 0.3227113188069714, + "learning_rate": 9.781417605302059e-06, + "loss": 0.5331, + "step": 1334 + }, + { + "epoch": 0.9257975034674064, + "grad_norm": 0.3263752872926798, + "learning_rate": 9.780709130822071e-06, + "loss": 0.5716, + "step": 1335 + }, + { + "epoch": 0.926490984743412, + "grad_norm": 0.3520999513258659, + "learning_rate": 9.779999535778487e-06, + "loss": 0.596, + "step": 1336 + }, + { + "epoch": 0.9271844660194175, + "grad_norm": 0.3276157528264304, + "learning_rate": 9.779288820337628e-06, + "loss": 0.5524, + "step": 1337 + }, + { + "epoch": 0.9278779472954231, + "grad_norm": 0.33534342304864945, + "learning_rate": 9.778576984666087e-06, + "loss": 0.4673, + "step": 1338 + }, + { + "epoch": 0.9285714285714286, + "grad_norm": 0.33542174245312506, + "learning_rate": 9.777864028930705e-06, + "loss": 0.5751, + "step": 1339 + }, + { + "epoch": 0.9292649098474342, + "grad_norm": 0.3392654105733671, + "learning_rate": 9.7771499532986e-06, + "loss": 0.5306, + "step": 1340 + }, + { + "epoch": 0.9299583911234397, + "grad_norm": 0.31461597060503727, + "learning_rate": 9.776434757937141e-06, + "loss": 0.5665, + "step": 1341 + }, + { + "epoch": 0.9306518723994452, + "grad_norm": 0.592434449703214, + "learning_rate": 9.775718443013969e-06, + "loss": 0.6112, + "step": 1342 + }, + { + "epoch": 0.9313453536754508, + "grad_norm": 0.3427716227598802, + "learning_rate": 9.77500100869698e-06, + "loss": 0.5809, + "step": 1343 + }, + { + "epoch": 0.9320388349514563, + "grad_norm": 0.34024552649717665, + "learning_rate": 9.774282455154338e-06, + "loss": 0.6318, + "step": 1344 + }, + { + "epoch": 0.9327323162274619, + "grad_norm": 0.3701056669384116, + "learning_rate": 9.773562782554467e-06, + "loss": 0.6098, + "step": 1345 + }, + { + "epoch": 0.9334257975034674, + "grad_norm": 0.35332055119718075, + "learning_rate": 9.77284199106605e-06, + "loss": 0.5642, + "step": 1346 + }, + { + "epoch": 0.934119278779473, + "grad_norm": 0.3400850066223855, + "learning_rate": 9.772120080858037e-06, + "loss": 0.6003, + "step": 1347 + }, + { + "epoch": 0.9348127600554785, + "grad_norm": 0.3553058316745625, + "learning_rate": 9.771397052099637e-06, + "loss": 0.5221, + "step": 1348 + }, + { + "epoch": 0.9355062413314841, + "grad_norm": 0.31577122634395005, + "learning_rate": 9.770672904960326e-06, + "loss": 0.5484, + "step": 1349 + }, + { + "epoch": 0.9361997226074896, + "grad_norm": 0.3379216804339027, + "learning_rate": 9.769947639609837e-06, + "loss": 0.5583, + "step": 1350 + }, + { + "epoch": 0.9368932038834952, + "grad_norm": 0.32929958892366473, + "learning_rate": 9.769221256218165e-06, + "loss": 0.5567, + "step": 1351 + }, + { + "epoch": 0.9375866851595007, + "grad_norm": 0.31964291586065235, + "learning_rate": 9.76849375495557e-06, + "loss": 0.5885, + "step": 1352 + }, + { + "epoch": 0.9382801664355063, + "grad_norm": 0.33509612920909093, + "learning_rate": 9.767765135992577e-06, + "loss": 0.5264, + "step": 1353 + }, + { + "epoch": 0.9389736477115118, + "grad_norm": 0.3077192276393366, + "learning_rate": 9.767035399499965e-06, + "loss": 0.5198, + "step": 1354 + }, + { + "epoch": 0.9396671289875174, + "grad_norm": 0.36276586392069327, + "learning_rate": 9.76630454564878e-06, + "loss": 0.4841, + "step": 1355 + }, + { + "epoch": 0.9403606102635229, + "grad_norm": 0.3348890170241416, + "learning_rate": 9.765572574610326e-06, + "loss": 0.5096, + "step": 1356 + }, + { + "epoch": 0.9410540915395285, + "grad_norm": 0.3368382436652647, + "learning_rate": 9.764839486556177e-06, + "loss": 0.6037, + "step": 1357 + }, + { + "epoch": 0.941747572815534, + "grad_norm": 0.3385194917774941, + "learning_rate": 9.764105281658161e-06, + "loss": 0.6269, + "step": 1358 + }, + { + "epoch": 0.9424410540915396, + "grad_norm": 0.320619109857471, + "learning_rate": 9.76336996008837e-06, + "loss": 0.5177, + "step": 1359 + }, + { + "epoch": 0.9431345353675451, + "grad_norm": 0.29871327239631223, + "learning_rate": 9.762633522019159e-06, + "loss": 0.5513, + "step": 1360 + }, + { + "epoch": 0.9438280166435506, + "grad_norm": 0.35889043316034214, + "learning_rate": 9.761895967623141e-06, + "loss": 0.5673, + "step": 1361 + }, + { + "epoch": 0.9445214979195562, + "grad_norm": 0.3203059268859659, + "learning_rate": 9.761157297073196e-06, + "loss": 0.5435, + "step": 1362 + }, + { + "epoch": 0.9452149791955617, + "grad_norm": 0.3511537235065825, + "learning_rate": 9.760417510542464e-06, + "loss": 0.6103, + "step": 1363 + }, + { + "epoch": 0.9459084604715673, + "grad_norm": 0.3604565708923628, + "learning_rate": 9.759676608204342e-06, + "loss": 0.6054, + "step": 1364 + }, + { + "epoch": 0.9466019417475728, + "grad_norm": 0.3342447123223769, + "learning_rate": 9.758934590232495e-06, + "loss": 0.5307, + "step": 1365 + }, + { + "epoch": 0.9472954230235784, + "grad_norm": 0.37965077086134635, + "learning_rate": 9.758191456800848e-06, + "loss": 0.5935, + "step": 1366 + }, + { + "epoch": 0.9479889042995839, + "grad_norm": 0.3478112253973906, + "learning_rate": 9.757447208083582e-06, + "loss": 0.5861, + "step": 1367 + }, + { + "epoch": 0.9486823855755895, + "grad_norm": 0.323960790991273, + "learning_rate": 9.756701844255145e-06, + "loss": 0.5148, + "step": 1368 + }, + { + "epoch": 0.949375866851595, + "grad_norm": 0.34803009997759937, + "learning_rate": 9.755955365490246e-06, + "loss": 0.5328, + "step": 1369 + }, + { + "epoch": 0.9500693481276006, + "grad_norm": 0.37453541262630863, + "learning_rate": 9.755207771963855e-06, + "loss": 0.5353, + "step": 1370 + }, + { + "epoch": 0.9507628294036061, + "grad_norm": 0.3773801044381051, + "learning_rate": 9.754459063851198e-06, + "loss": 0.5594, + "step": 1371 + }, + { + "epoch": 0.9514563106796117, + "grad_norm": 0.33016084135655566, + "learning_rate": 9.753709241327773e-06, + "loss": 0.5512, + "step": 1372 + }, + { + "epoch": 0.9521497919556172, + "grad_norm": 0.33497911250209167, + "learning_rate": 9.752958304569327e-06, + "loss": 0.5447, + "step": 1373 + }, + { + "epoch": 0.9528432732316228, + "grad_norm": 0.34373072248120534, + "learning_rate": 9.75220625375188e-06, + "loss": 0.5838, + "step": 1374 + }, + { + "epoch": 0.9535367545076283, + "grad_norm": 0.30820762039099153, + "learning_rate": 9.7514530890517e-06, + "loss": 0.5234, + "step": 1375 + }, + { + "epoch": 0.9542302357836339, + "grad_norm": 0.4176006756154152, + "learning_rate": 9.750698810645331e-06, + "loss": 0.4976, + "step": 1376 + }, + { + "epoch": 0.9549237170596394, + "grad_norm": 0.33539094543261083, + "learning_rate": 9.749943418709567e-06, + "loss": 0.5293, + "step": 1377 + }, + { + "epoch": 0.955617198335645, + "grad_norm": 0.3324000901041926, + "learning_rate": 9.749186913421465e-06, + "loss": 0.5036, + "step": 1378 + }, + { + "epoch": 0.9563106796116505, + "grad_norm": 0.3812265495386306, + "learning_rate": 9.748429294958345e-06, + "loss": 0.57, + "step": 1379 + }, + { + "epoch": 0.957004160887656, + "grad_norm": 0.34076982875814876, + "learning_rate": 9.74767056349779e-06, + "loss": 0.5548, + "step": 1380 + }, + { + "epoch": 0.9576976421636616, + "grad_norm": 0.37913491187013104, + "learning_rate": 9.74691071921764e-06, + "loss": 0.5821, + "step": 1381 + }, + { + "epoch": 0.9583911234396671, + "grad_norm": 0.3382320688469176, + "learning_rate": 9.746149762295994e-06, + "loss": 0.5591, + "step": 1382 + }, + { + "epoch": 0.9590846047156727, + "grad_norm": 0.3326244476434884, + "learning_rate": 9.745387692911217e-06, + "loss": 0.5585, + "step": 1383 + }, + { + "epoch": 0.9597780859916782, + "grad_norm": 0.3078525234697808, + "learning_rate": 9.744624511241933e-06, + "loss": 0.5572, + "step": 1384 + }, + { + "epoch": 0.9604715672676838, + "grad_norm": 0.30277126124901926, + "learning_rate": 9.743860217467024e-06, + "loss": 0.4968, + "step": 1385 + }, + { + "epoch": 0.9611650485436893, + "grad_norm": 0.33782360856625965, + "learning_rate": 9.74309481176564e-06, + "loss": 0.5236, + "step": 1386 + }, + { + "epoch": 0.9618585298196949, + "grad_norm": 0.38359156051992216, + "learning_rate": 9.742328294317181e-06, + "loss": 0.5844, + "step": 1387 + }, + { + "epoch": 0.9625520110957004, + "grad_norm": 0.35228815113528755, + "learning_rate": 9.741560665301316e-06, + "loss": 0.5921, + "step": 1388 + }, + { + "epoch": 0.963245492371706, + "grad_norm": 0.31593010433623414, + "learning_rate": 9.740791924897973e-06, + "loss": 0.5203, + "step": 1389 + }, + { + "epoch": 0.9639389736477115, + "grad_norm": 0.33872246664457734, + "learning_rate": 9.740022073287334e-06, + "loss": 0.5621, + "step": 1390 + }, + { + "epoch": 0.9646324549237171, + "grad_norm": 0.3481728645175325, + "learning_rate": 9.73925111064985e-06, + "loss": 0.5459, + "step": 1391 + }, + { + "epoch": 0.9653259361997226, + "grad_norm": 0.3541419905260788, + "learning_rate": 9.738479037166231e-06, + "loss": 0.6089, + "step": 1392 + }, + { + "epoch": 0.9660194174757282, + "grad_norm": 0.3504710653288276, + "learning_rate": 9.737705853017442e-06, + "loss": 0.5465, + "step": 1393 + }, + { + "epoch": 0.9667128987517337, + "grad_norm": 0.3063424863612362, + "learning_rate": 9.736931558384713e-06, + "loss": 0.498, + "step": 1394 + }, + { + "epoch": 0.9674063800277393, + "grad_norm": 0.37557717405228963, + "learning_rate": 9.736156153449534e-06, + "loss": 0.5892, + "step": 1395 + }, + { + "epoch": 0.9680998613037448, + "grad_norm": 0.34004988273217296, + "learning_rate": 9.735379638393654e-06, + "loss": 0.5521, + "step": 1396 + }, + { + "epoch": 0.9687933425797504, + "grad_norm": 0.3387287763392039, + "learning_rate": 9.73460201339908e-06, + "loss": 0.5778, + "step": 1397 + }, + { + "epoch": 0.9694868238557559, + "grad_norm": 0.3364092001009357, + "learning_rate": 9.733823278648084e-06, + "loss": 0.5313, + "step": 1398 + }, + { + "epoch": 0.9701803051317615, + "grad_norm": 0.33563973505245465, + "learning_rate": 9.733043434323197e-06, + "loss": 0.5812, + "step": 1399 + }, + { + "epoch": 0.970873786407767, + "grad_norm": 0.40173479450985844, + "learning_rate": 9.732262480607207e-06, + "loss": 0.5476, + "step": 1400 + }, + { + "epoch": 0.9715672676837726, + "grad_norm": 0.34481293985057604, + "learning_rate": 9.731480417683163e-06, + "loss": 0.569, + "step": 1401 + }, + { + "epoch": 0.9722607489597781, + "grad_norm": 0.3169175051150628, + "learning_rate": 9.730697245734377e-06, + "loss": 0.5368, + "step": 1402 + }, + { + "epoch": 0.9729542302357836, + "grad_norm": 0.3519871300399793, + "learning_rate": 9.729912964944419e-06, + "loss": 0.5382, + "step": 1403 + }, + { + "epoch": 0.9736477115117892, + "grad_norm": 0.3799305843804598, + "learning_rate": 9.729127575497116e-06, + "loss": 0.5176, + "step": 1404 + }, + { + "epoch": 0.9743411927877947, + "grad_norm": 0.33770682525361484, + "learning_rate": 9.72834107757656e-06, + "loss": 0.5603, + "step": 1405 + }, + { + "epoch": 0.9750346740638003, + "grad_norm": 0.34256064354796206, + "learning_rate": 9.727553471367099e-06, + "loss": 0.5568, + "step": 1406 + }, + { + "epoch": 0.9757281553398058, + "grad_norm": 0.34803047833693523, + "learning_rate": 9.726764757053343e-06, + "loss": 0.6481, + "step": 1407 + }, + { + "epoch": 0.9764216366158114, + "grad_norm": 0.3814251518224469, + "learning_rate": 9.725974934820162e-06, + "loss": 0.6152, + "step": 1408 + }, + { + "epoch": 0.9771151178918169, + "grad_norm": 0.35670775377754504, + "learning_rate": 9.725184004852681e-06, + "loss": 0.5648, + "step": 1409 + }, + { + "epoch": 0.9778085991678225, + "grad_norm": 0.32713035388587347, + "learning_rate": 9.724391967336293e-06, + "loss": 0.531, + "step": 1410 + }, + { + "epoch": 0.978502080443828, + "grad_norm": 0.3309089807646535, + "learning_rate": 9.723598822456643e-06, + "loss": 0.5004, + "step": 1411 + }, + { + "epoch": 0.9791955617198336, + "grad_norm": 0.3189261185875698, + "learning_rate": 9.722804570399638e-06, + "loss": 0.5492, + "step": 1412 + }, + { + "epoch": 0.9798890429958391, + "grad_norm": 0.3212823172868208, + "learning_rate": 9.722009211351447e-06, + "loss": 0.53, + "step": 1413 + }, + { + "epoch": 0.9805825242718447, + "grad_norm": 0.32937476591309023, + "learning_rate": 9.721212745498493e-06, + "loss": 0.5836, + "step": 1414 + }, + { + "epoch": 0.9812760055478502, + "grad_norm": 0.32920213673407056, + "learning_rate": 9.720415173027466e-06, + "loss": 0.5727, + "step": 1415 + }, + { + "epoch": 0.9819694868238558, + "grad_norm": 0.3186506108728332, + "learning_rate": 9.719616494125311e-06, + "loss": 0.5419, + "step": 1416 + }, + { + "epoch": 0.9826629680998613, + "grad_norm": 0.3454233802084189, + "learning_rate": 9.718816708979228e-06, + "loss": 0.6737, + "step": 1417 + }, + { + "epoch": 0.9833564493758669, + "grad_norm": 0.3456448789102721, + "learning_rate": 9.718015817776684e-06, + "loss": 0.6151, + "step": 1418 + }, + { + "epoch": 0.9840499306518724, + "grad_norm": 0.3596776939403398, + "learning_rate": 9.717213820705403e-06, + "loss": 0.5878, + "step": 1419 + }, + { + "epoch": 0.984743411927878, + "grad_norm": 0.3492056949754736, + "learning_rate": 9.716410717953364e-06, + "loss": 0.5463, + "step": 1420 + }, + { + "epoch": 0.9854368932038835, + "grad_norm": 0.32360037406045555, + "learning_rate": 9.715606509708812e-06, + "loss": 0.5638, + "step": 1421 + }, + { + "epoch": 0.986130374479889, + "grad_norm": 0.4256266939199784, + "learning_rate": 9.714801196160247e-06, + "loss": 0.5373, + "step": 1422 + }, + { + "epoch": 0.9868238557558946, + "grad_norm": 0.34243199989289114, + "learning_rate": 9.713994777496427e-06, + "loss": 0.5427, + "step": 1423 + }, + { + "epoch": 0.9875173370319001, + "grad_norm": 0.35630692729016344, + "learning_rate": 9.71318725390637e-06, + "loss": 0.5726, + "step": 1424 + }, + { + "epoch": 0.9882108183079057, + "grad_norm": 0.32541903837787706, + "learning_rate": 9.712378625579358e-06, + "loss": 0.5682, + "step": 1425 + }, + { + "epoch": 0.9889042995839112, + "grad_norm": 0.33493185831625, + "learning_rate": 9.711568892704924e-06, + "loss": 0.5177, + "step": 1426 + }, + { + "epoch": 0.9895977808599168, + "grad_norm": 0.38164286844146134, + "learning_rate": 9.710758055472862e-06, + "loss": 0.5466, + "step": 1427 + }, + { + "epoch": 0.9902912621359223, + "grad_norm": 0.3363719194150675, + "learning_rate": 9.709946114073231e-06, + "loss": 0.6056, + "step": 1428 + }, + { + "epoch": 0.9909847434119279, + "grad_norm": 0.34442741007792727, + "learning_rate": 9.70913306869634e-06, + "loss": 0.5251, + "step": 1429 + }, + { + "epoch": 0.9916782246879334, + "grad_norm": 0.3359985260331031, + "learning_rate": 9.708318919532766e-06, + "loss": 0.5069, + "step": 1430 + }, + { + "epoch": 0.992371705963939, + "grad_norm": 0.33141103763419355, + "learning_rate": 9.707503666773334e-06, + "loss": 0.507, + "step": 1431 + }, + { + "epoch": 0.9930651872399445, + "grad_norm": 0.31911653176339655, + "learning_rate": 9.706687310609137e-06, + "loss": 0.5459, + "step": 1432 + }, + { + "epoch": 0.9937586685159501, + "grad_norm": 0.33360539375342685, + "learning_rate": 9.705869851231522e-06, + "loss": 0.5217, + "step": 1433 + }, + { + "epoch": 0.9944521497919556, + "grad_norm": 0.3261088579418762, + "learning_rate": 9.705051288832095e-06, + "loss": 0.5828, + "step": 1434 + }, + { + "epoch": 0.9951456310679612, + "grad_norm": 0.33826855892003294, + "learning_rate": 9.704231623602721e-06, + "loss": 0.5251, + "step": 1435 + }, + { + "epoch": 0.9958391123439667, + "grad_norm": 0.32907816015207925, + "learning_rate": 9.703410855735525e-06, + "loss": 0.5328, + "step": 1436 + }, + { + "epoch": 0.9965325936199723, + "grad_norm": 0.288222460017734, + "learning_rate": 9.702588985422887e-06, + "loss": 0.4961, + "step": 1437 + }, + { + "epoch": 0.9972260748959778, + "grad_norm": 0.3641372537449462, + "learning_rate": 9.701766012857448e-06, + "loss": 0.5562, + "step": 1438 + }, + { + "epoch": 0.9979195561719834, + "grad_norm": 0.3298489645493611, + "learning_rate": 9.700941938232108e-06, + "loss": 0.5677, + "step": 1439 + }, + { + "epoch": 0.9986130374479889, + "grad_norm": 0.4119776348892953, + "learning_rate": 9.700116761740024e-06, + "loss": 0.5339, + "step": 1440 + }, + { + "epoch": 0.9993065187239945, + "grad_norm": 0.3159886207892137, + "learning_rate": 9.699290483574611e-06, + "loss": 0.5503, + "step": 1441 + }, + { + "epoch": 1.0, + "grad_norm": 0.41798594104440057, + "learning_rate": 9.698463103929542e-06, + "loss": 0.5842, + "step": 1442 + }, + { + "epoch": 1.0006934812760055, + "grad_norm": 0.30210117536250447, + "learning_rate": 9.69763462299875e-06, + "loss": 0.5206, + "step": 1443 + }, + { + "epoch": 1.001386962552011, + "grad_norm": 0.32902579991011144, + "learning_rate": 9.696805040976425e-06, + "loss": 0.5224, + "step": 1444 + }, + { + "epoch": 1.0020804438280166, + "grad_norm": 0.33674165962137353, + "learning_rate": 9.695974358057012e-06, + "loss": 0.5794, + "step": 1445 + }, + { + "epoch": 1.0027739251040222, + "grad_norm": 0.631204063917515, + "learning_rate": 9.695142574435222e-06, + "loss": 0.5339, + "step": 1446 + }, + { + "epoch": 1.0034674063800277, + "grad_norm": 0.3558127426324594, + "learning_rate": 9.694309690306013e-06, + "loss": 0.5176, + "step": 1447 + }, + { + "epoch": 1.0041608876560333, + "grad_norm": 0.3237328478105614, + "learning_rate": 9.693475705864613e-06, + "loss": 0.4968, + "step": 1448 + }, + { + "epoch": 1.0048543689320388, + "grad_norm": 0.330996494521083, + "learning_rate": 9.692640621306497e-06, + "loss": 0.4693, + "step": 1449 + }, + { + "epoch": 1.0055478502080444, + "grad_norm": 0.3607636151790628, + "learning_rate": 9.691804436827409e-06, + "loss": 0.5404, + "step": 1450 + }, + { + "epoch": 1.00624133148405, + "grad_norm": 0.3430460470026121, + "learning_rate": 9.690967152623337e-06, + "loss": 0.5319, + "step": 1451 + }, + { + "epoch": 1.0069348127600555, + "grad_norm": 0.3569378890517885, + "learning_rate": 9.690128768890538e-06, + "loss": 0.5697, + "step": 1452 + }, + { + "epoch": 1.007628294036061, + "grad_norm": 0.3018546951841948, + "learning_rate": 9.689289285825526e-06, + "loss": 0.4494, + "step": 1453 + }, + { + "epoch": 1.0083217753120666, + "grad_norm": 0.32523767944287174, + "learning_rate": 9.688448703625063e-06, + "loss": 0.5394, + "step": 1454 + }, + { + "epoch": 1.0090152565880721, + "grad_norm": 0.3445515603593232, + "learning_rate": 9.687607022486183e-06, + "loss": 0.5127, + "step": 1455 + }, + { + "epoch": 1.0097087378640777, + "grad_norm": 0.33812903590257065, + "learning_rate": 9.686764242606164e-06, + "loss": 0.5398, + "step": 1456 + }, + { + "epoch": 1.0104022191400832, + "grad_norm": 0.32268794575907395, + "learning_rate": 9.68592036418255e-06, + "loss": 0.5247, + "step": 1457 + }, + { + "epoch": 1.0110957004160888, + "grad_norm": 0.34210414184527377, + "learning_rate": 9.685075387413139e-06, + "loss": 0.5244, + "step": 1458 + }, + { + "epoch": 1.0117891816920943, + "grad_norm": 0.3368594866088289, + "learning_rate": 9.68422931249599e-06, + "loss": 0.5604, + "step": 1459 + }, + { + "epoch": 1.0124826629680999, + "grad_norm": 0.34154410786238604, + "learning_rate": 9.683382139629414e-06, + "loss": 0.5333, + "step": 1460 + }, + { + "epoch": 1.0131761442441054, + "grad_norm": 0.34571869034008895, + "learning_rate": 9.682533869011983e-06, + "loss": 0.4838, + "step": 1461 + }, + { + "epoch": 1.013869625520111, + "grad_norm": 0.3146540126331417, + "learning_rate": 9.681684500842525e-06, + "loss": 0.5121, + "step": 1462 + }, + { + "epoch": 1.0145631067961165, + "grad_norm": 0.3125075765645134, + "learning_rate": 9.680834035320127e-06, + "loss": 0.5131, + "step": 1463 + }, + { + "epoch": 1.015256588072122, + "grad_norm": 0.39937888878270666, + "learning_rate": 9.679982472644132e-06, + "loss": 0.5486, + "step": 1464 + }, + { + "epoch": 1.0159500693481276, + "grad_norm": 0.3984132940813232, + "learning_rate": 9.679129813014137e-06, + "loss": 0.5956, + "step": 1465 + }, + { + "epoch": 1.0166435506241331, + "grad_norm": 0.33557152839212523, + "learning_rate": 9.678276056630005e-06, + "loss": 0.4936, + "step": 1466 + }, + { + "epoch": 1.0173370319001387, + "grad_norm": 0.3358805794503355, + "learning_rate": 9.677421203691844e-06, + "loss": 0.4719, + "step": 1467 + }, + { + "epoch": 1.0180305131761442, + "grad_norm": 0.3422084296775962, + "learning_rate": 9.67656525440003e-06, + "loss": 0.4905, + "step": 1468 + }, + { + "epoch": 1.0187239944521498, + "grad_norm": 0.3244902181064797, + "learning_rate": 9.67570820895519e-06, + "loss": 0.5092, + "step": 1469 + }, + { + "epoch": 1.0194174757281553, + "grad_norm": 0.34140129804374164, + "learning_rate": 9.674850067558209e-06, + "loss": 0.529, + "step": 1470 + }, + { + "epoch": 1.0201109570041609, + "grad_norm": 0.36152772655245036, + "learning_rate": 9.673990830410227e-06, + "loss": 0.5079, + "step": 1471 + }, + { + "epoch": 1.0208044382801664, + "grad_norm": 0.37207997894308903, + "learning_rate": 9.673130497712646e-06, + "loss": 0.5246, + "step": 1472 + }, + { + "epoch": 1.021497919556172, + "grad_norm": 0.34537185865971043, + "learning_rate": 9.672269069667122e-06, + "loss": 0.4273, + "step": 1473 + }, + { + "epoch": 1.0221914008321775, + "grad_norm": 0.41690655380322045, + "learning_rate": 9.671406546475564e-06, + "loss": 0.4621, + "step": 1474 + }, + { + "epoch": 1.022884882108183, + "grad_norm": 0.3458134469043695, + "learning_rate": 9.670542928340145e-06, + "loss": 0.5281, + "step": 1475 + }, + { + "epoch": 1.0235783633841886, + "grad_norm": 0.37397991921336665, + "learning_rate": 9.669678215463289e-06, + "loss": 0.5234, + "step": 1476 + }, + { + "epoch": 1.0242718446601942, + "grad_norm": 0.3439034685625693, + "learning_rate": 9.66881240804768e-06, + "loss": 0.4929, + "step": 1477 + }, + { + "epoch": 1.0249653259361997, + "grad_norm": 0.34605004852510335, + "learning_rate": 9.667945506296252e-06, + "loss": 0.5, + "step": 1478 + }, + { + "epoch": 1.0256588072122053, + "grad_norm": 0.35541572537780564, + "learning_rate": 9.667077510412206e-06, + "loss": 0.4906, + "step": 1479 + }, + { + "epoch": 1.0263522884882108, + "grad_norm": 0.3345591255482935, + "learning_rate": 9.666208420598993e-06, + "loss": 0.5132, + "step": 1480 + }, + { + "epoch": 1.0270457697642164, + "grad_norm": 0.2982284334195003, + "learning_rate": 9.66533823706032e-06, + "loss": 0.4397, + "step": 1481 + }, + { + "epoch": 1.027739251040222, + "grad_norm": 0.4170703038184141, + "learning_rate": 9.664466960000152e-06, + "loss": 0.5478, + "step": 1482 + }, + { + "epoch": 1.0284327323162274, + "grad_norm": 0.3775213410406168, + "learning_rate": 9.663594589622711e-06, + "loss": 0.573, + "step": 1483 + }, + { + "epoch": 1.029126213592233, + "grad_norm": 0.36964517040338624, + "learning_rate": 9.662721126132473e-06, + "loss": 0.5079, + "step": 1484 + }, + { + "epoch": 1.0298196948682385, + "grad_norm": 0.3232631642537663, + "learning_rate": 9.661846569734173e-06, + "loss": 0.4855, + "step": 1485 + }, + { + "epoch": 1.030513176144244, + "grad_norm": 0.3273358873941601, + "learning_rate": 9.660970920632798e-06, + "loss": 0.4769, + "step": 1486 + }, + { + "epoch": 1.0312066574202496, + "grad_norm": 0.36800348272304073, + "learning_rate": 9.660094179033596e-06, + "loss": 0.5083, + "step": 1487 + }, + { + "epoch": 1.0319001386962552, + "grad_norm": 0.36080054248376503, + "learning_rate": 9.659216345142068e-06, + "loss": 0.6337, + "step": 1488 + }, + { + "epoch": 1.0325936199722607, + "grad_norm": 0.36832389123711534, + "learning_rate": 9.658337419163973e-06, + "loss": 0.5422, + "step": 1489 + }, + { + "epoch": 1.0332871012482663, + "grad_norm": 0.3267218741240164, + "learning_rate": 9.657457401305324e-06, + "loss": 0.4771, + "step": 1490 + }, + { + "epoch": 1.0339805825242718, + "grad_norm": 0.35264188713803, + "learning_rate": 9.656576291772392e-06, + "loss": 0.5254, + "step": 1491 + }, + { + "epoch": 1.0346740638002774, + "grad_norm": 0.3560121286153758, + "learning_rate": 9.655694090771701e-06, + "loss": 0.5275, + "step": 1492 + }, + { + "epoch": 1.035367545076283, + "grad_norm": 0.3517252354642808, + "learning_rate": 9.654810798510033e-06, + "loss": 0.47, + "step": 1493 + }, + { + "epoch": 1.0360610263522885, + "grad_norm": 0.36263265506354103, + "learning_rate": 9.653926415194426e-06, + "loss": 0.4917, + "step": 1494 + }, + { + "epoch": 1.036754507628294, + "grad_norm": 0.31992687288964183, + "learning_rate": 9.653040941032173e-06, + "loss": 0.4932, + "step": 1495 + }, + { + "epoch": 1.0374479889042996, + "grad_norm": 0.45065245541497273, + "learning_rate": 9.652154376230822e-06, + "loss": 0.4644, + "step": 1496 + }, + { + "epoch": 1.0381414701803051, + "grad_norm": 0.3501687836457089, + "learning_rate": 9.651266720998176e-06, + "loss": 0.5284, + "step": 1497 + }, + { + "epoch": 1.0388349514563107, + "grad_norm": 0.29717922058653945, + "learning_rate": 9.650377975542298e-06, + "loss": 0.4688, + "step": 1498 + }, + { + "epoch": 1.0395284327323162, + "grad_norm": 0.46145110093051234, + "learning_rate": 9.649488140071503e-06, + "loss": 0.5344, + "step": 1499 + }, + { + "epoch": 1.0402219140083218, + "grad_norm": 0.3253067300049634, + "learning_rate": 9.64859721479436e-06, + "loss": 0.5179, + "step": 1500 + }, + { + "epoch": 1.0409153952843273, + "grad_norm": 0.4273424033637859, + "learning_rate": 9.647705199919697e-06, + "loss": 0.5554, + "step": 1501 + }, + { + "epoch": 1.0416088765603329, + "grad_norm": 0.3377962612132626, + "learning_rate": 9.646812095656595e-06, + "loss": 0.547, + "step": 1502 + }, + { + "epoch": 1.0423023578363384, + "grad_norm": 0.3181516767383569, + "learning_rate": 9.645917902214393e-06, + "loss": 0.494, + "step": 1503 + }, + { + "epoch": 1.042995839112344, + "grad_norm": 0.3529567066677283, + "learning_rate": 9.64502261980268e-06, + "loss": 0.4958, + "step": 1504 + }, + { + "epoch": 1.0436893203883495, + "grad_norm": 0.3974998943327576, + "learning_rate": 9.644126248631306e-06, + "loss": 0.5329, + "step": 1505 + }, + { + "epoch": 1.044382801664355, + "grad_norm": 0.3627345336594509, + "learning_rate": 9.643228788910374e-06, + "loss": 0.5721, + "step": 1506 + }, + { + "epoch": 1.0450762829403606, + "grad_norm": 0.3447151808243214, + "learning_rate": 9.642330240850244e-06, + "loss": 0.4987, + "step": 1507 + }, + { + "epoch": 1.0457697642163661, + "grad_norm": 0.31171379839843344, + "learning_rate": 9.641430604661523e-06, + "loss": 0.4798, + "step": 1508 + }, + { + "epoch": 1.0464632454923717, + "grad_norm": 0.3491924560910556, + "learning_rate": 9.640529880555086e-06, + "loss": 0.5327, + "step": 1509 + }, + { + "epoch": 1.0471567267683772, + "grad_norm": 0.3470118733588684, + "learning_rate": 9.639628068742053e-06, + "loss": 0.4927, + "step": 1510 + }, + { + "epoch": 1.0478502080443828, + "grad_norm": 0.3165401535268853, + "learning_rate": 9.638725169433801e-06, + "loss": 0.5064, + "step": 1511 + }, + { + "epoch": 1.0485436893203883, + "grad_norm": 0.3450271285443684, + "learning_rate": 9.637821182841965e-06, + "loss": 0.5146, + "step": 1512 + }, + { + "epoch": 1.0492371705963939, + "grad_norm": 0.35980398821611514, + "learning_rate": 9.636916109178433e-06, + "loss": 0.5088, + "step": 1513 + }, + { + "epoch": 1.0499306518723994, + "grad_norm": 0.32696446968109916, + "learning_rate": 9.636009948655348e-06, + "loss": 0.5535, + "step": 1514 + }, + { + "epoch": 1.050624133148405, + "grad_norm": 0.35722384331251, + "learning_rate": 9.635102701485103e-06, + "loss": 0.5224, + "step": 1515 + }, + { + "epoch": 1.0513176144244105, + "grad_norm": 0.34472637255773353, + "learning_rate": 9.634194367880357e-06, + "loss": 0.5181, + "step": 1516 + }, + { + "epoch": 1.052011095700416, + "grad_norm": 0.331116228361004, + "learning_rate": 9.633284948054014e-06, + "loss": 0.5302, + "step": 1517 + }, + { + "epoch": 1.0527045769764216, + "grad_norm": 0.3221314677000127, + "learning_rate": 9.632374442219232e-06, + "loss": 0.5154, + "step": 1518 + }, + { + "epoch": 1.0533980582524272, + "grad_norm": 0.386738543078458, + "learning_rate": 9.631462850589432e-06, + "loss": 0.4517, + "step": 1519 + }, + { + "epoch": 1.0540915395284327, + "grad_norm": 0.3421683778508663, + "learning_rate": 9.630550173378283e-06, + "loss": 0.477, + "step": 1520 + }, + { + "epoch": 1.0547850208044383, + "grad_norm": 0.36648948977771956, + "learning_rate": 9.629636410799709e-06, + "loss": 0.5416, + "step": 1521 + }, + { + "epoch": 1.0554785020804438, + "grad_norm": 0.3480063837990978, + "learning_rate": 9.628721563067888e-06, + "loss": 0.5573, + "step": 1522 + }, + { + "epoch": 1.0561719833564494, + "grad_norm": 0.3412320378121699, + "learning_rate": 9.627805630397257e-06, + "loss": 0.5525, + "step": 1523 + }, + { + "epoch": 1.056865464632455, + "grad_norm": 0.34131100799573993, + "learning_rate": 9.626888613002502e-06, + "loss": 0.4414, + "step": 1524 + }, + { + "epoch": 1.0575589459084604, + "grad_norm": 0.3416796835258787, + "learning_rate": 9.625970511098566e-06, + "loss": 0.4887, + "step": 1525 + }, + { + "epoch": 1.058252427184466, + "grad_norm": 0.35632388953029986, + "learning_rate": 9.625051324900645e-06, + "loss": 0.5043, + "step": 1526 + }, + { + "epoch": 1.0589459084604715, + "grad_norm": 0.5440807322096927, + "learning_rate": 9.624131054624189e-06, + "loss": 0.4934, + "step": 1527 + }, + { + "epoch": 1.059639389736477, + "grad_norm": 0.3308052835280193, + "learning_rate": 9.623209700484903e-06, + "loss": 0.4871, + "step": 1528 + }, + { + "epoch": 1.0603328710124826, + "grad_norm": 0.3267267804163821, + "learning_rate": 9.622287262698748e-06, + "loss": 0.5444, + "step": 1529 + }, + { + "epoch": 1.0610263522884882, + "grad_norm": 0.33432560623564445, + "learning_rate": 9.621363741481933e-06, + "loss": 0.5253, + "step": 1530 + }, + { + "epoch": 1.0617198335644937, + "grad_norm": 0.3712652577662211, + "learning_rate": 9.620439137050927e-06, + "loss": 0.5067, + "step": 1531 + }, + { + "epoch": 1.0624133148404993, + "grad_norm": 0.3864313408235188, + "learning_rate": 9.619513449622451e-06, + "loss": 0.6148, + "step": 1532 + }, + { + "epoch": 1.0631067961165048, + "grad_norm": 0.3513007549930898, + "learning_rate": 9.618586679413477e-06, + "loss": 0.5435, + "step": 1533 + }, + { + "epoch": 1.0638002773925104, + "grad_norm": 0.30495454581270853, + "learning_rate": 9.617658826641235e-06, + "loss": 0.4324, + "step": 1534 + }, + { + "epoch": 1.064493758668516, + "grad_norm": 0.35225997807973775, + "learning_rate": 9.616729891523207e-06, + "loss": 0.5452, + "step": 1535 + }, + { + "epoch": 1.0651872399445215, + "grad_norm": 0.3384302270424511, + "learning_rate": 9.61579987427713e-06, + "loss": 0.4571, + "step": 1536 + }, + { + "epoch": 1.065880721220527, + "grad_norm": 0.3644638546315168, + "learning_rate": 9.61486877512099e-06, + "loss": 0.5117, + "step": 1537 + }, + { + "epoch": 1.0665742024965326, + "grad_norm": 0.37408064789556505, + "learning_rate": 9.61393659427303e-06, + "loss": 0.5233, + "step": 1538 + }, + { + "epoch": 1.0672676837725381, + "grad_norm": 0.46162782166701477, + "learning_rate": 9.613003331951749e-06, + "loss": 0.5393, + "step": 1539 + }, + { + "epoch": 1.0679611650485437, + "grad_norm": 0.35267804778946427, + "learning_rate": 9.612068988375898e-06, + "loss": 0.5304, + "step": 1540 + }, + { + "epoch": 1.0686546463245492, + "grad_norm": 0.351513786905638, + "learning_rate": 9.611133563764476e-06, + "loss": 0.5513, + "step": 1541 + }, + { + "epoch": 1.0693481276005548, + "grad_norm": 0.3718476035880555, + "learning_rate": 9.610197058336743e-06, + "loss": 0.4899, + "step": 1542 + }, + { + "epoch": 1.0700416088765603, + "grad_norm": 0.3112657247404451, + "learning_rate": 9.609259472312208e-06, + "loss": 0.458, + "step": 1543 + }, + { + "epoch": 1.0707350901525658, + "grad_norm": 0.3309923751917419, + "learning_rate": 9.608320805910633e-06, + "loss": 0.5381, + "step": 1544 + }, + { + "epoch": 1.0714285714285714, + "grad_norm": 0.3245404525893565, + "learning_rate": 9.60738105935204e-06, + "loss": 0.5078, + "step": 1545 + }, + { + "epoch": 1.072122052704577, + "grad_norm": 0.31387432841180757, + "learning_rate": 9.60644023285669e-06, + "loss": 0.4501, + "step": 1546 + }, + { + "epoch": 1.0728155339805825, + "grad_norm": 0.3540152884221349, + "learning_rate": 9.605498326645115e-06, + "loss": 0.5162, + "step": 1547 + }, + { + "epoch": 1.073509015256588, + "grad_norm": 0.331812147139016, + "learning_rate": 9.604555340938084e-06, + "loss": 0.4702, + "step": 1548 + }, + { + "epoch": 1.0742024965325936, + "grad_norm": 0.31369777450737346, + "learning_rate": 9.603611275956632e-06, + "loss": 0.4658, + "step": 1549 + }, + { + "epoch": 1.0748959778085991, + "grad_norm": 0.3772426365384512, + "learning_rate": 9.602666131922036e-06, + "loss": 0.5577, + "step": 1550 + }, + { + "epoch": 1.0755894590846047, + "grad_norm": 0.36915143235135567, + "learning_rate": 9.60171990905583e-06, + "loss": 0.4429, + "step": 1551 + }, + { + "epoch": 1.0762829403606102, + "grad_norm": 0.3658905222328625, + "learning_rate": 9.60077260757981e-06, + "loss": 0.5529, + "step": 1552 + }, + { + "epoch": 1.0769764216366158, + "grad_norm": 0.3597757970231172, + "learning_rate": 9.599824227716007e-06, + "loss": 0.5574, + "step": 1553 + }, + { + "epoch": 1.0776699029126213, + "grad_norm": 0.3406320223217798, + "learning_rate": 9.598874769686721e-06, + "loss": 0.4445, + "step": 1554 + }, + { + "epoch": 1.0783633841886269, + "grad_norm": 0.3176393628455282, + "learning_rate": 9.597924233714494e-06, + "loss": 0.4514, + "step": 1555 + }, + { + "epoch": 1.0790568654646324, + "grad_norm": 0.35286977008213233, + "learning_rate": 9.596972620022127e-06, + "loss": 0.5389, + "step": 1556 + }, + { + "epoch": 1.079750346740638, + "grad_norm": 0.36820121109917364, + "learning_rate": 9.59601992883267e-06, + "loss": 0.5297, + "step": 1557 + }, + { + "epoch": 1.0804438280166435, + "grad_norm": 0.43598247777508037, + "learning_rate": 9.595066160369428e-06, + "loss": 0.5182, + "step": 1558 + }, + { + "epoch": 1.081137309292649, + "grad_norm": 0.3169545052914446, + "learning_rate": 9.594111314855957e-06, + "loss": 0.4946, + "step": 1559 + }, + { + "epoch": 1.0818307905686546, + "grad_norm": 0.35954800326925107, + "learning_rate": 9.593155392516066e-06, + "loss": 0.5161, + "step": 1560 + }, + { + "epoch": 1.0825242718446602, + "grad_norm": 0.35787675211563713, + "learning_rate": 9.592198393573816e-06, + "loss": 0.5184, + "step": 1561 + }, + { + "epoch": 1.0832177531206657, + "grad_norm": 0.31632027854129957, + "learning_rate": 9.591240318253521e-06, + "loss": 0.4736, + "step": 1562 + }, + { + "epoch": 1.0839112343966713, + "grad_norm": 2.9569303138652336, + "learning_rate": 9.590281166779747e-06, + "loss": 0.4809, + "step": 1563 + }, + { + "epoch": 1.0846047156726768, + "grad_norm": 0.3245619107218756, + "learning_rate": 9.58932093937731e-06, + "loss": 0.4868, + "step": 1564 + }, + { + "epoch": 1.0852981969486823, + "grad_norm": 0.3558834445926099, + "learning_rate": 9.588359636271284e-06, + "loss": 0.5747, + "step": 1565 + }, + { + "epoch": 1.085991678224688, + "grad_norm": 0.3732340685361256, + "learning_rate": 9.587397257686992e-06, + "loss": 0.563, + "step": 1566 + }, + { + "epoch": 1.0866851595006934, + "grad_norm": 0.3444506803272316, + "learning_rate": 9.586433803850002e-06, + "loss": 0.527, + "step": 1567 + }, + { + "epoch": 1.087378640776699, + "grad_norm": 0.34876944378169805, + "learning_rate": 9.585469274986148e-06, + "loss": 0.4909, + "step": 1568 + }, + { + "epoch": 1.0880721220527045, + "grad_norm": 0.34385821310745096, + "learning_rate": 9.584503671321503e-06, + "loss": 0.5332, + "step": 1569 + }, + { + "epoch": 1.08876560332871, + "grad_norm": 0.3237590691468841, + "learning_rate": 9.583536993082402e-06, + "loss": 0.5115, + "step": 1570 + }, + { + "epoch": 1.0894590846047156, + "grad_norm": 0.3633712964933024, + "learning_rate": 9.582569240495426e-06, + "loss": 0.4967, + "step": 1571 + }, + { + "epoch": 1.0901525658807212, + "grad_norm": 0.3604223774031506, + "learning_rate": 9.581600413787406e-06, + "loss": 0.6134, + "step": 1572 + }, + { + "epoch": 1.0908460471567267, + "grad_norm": 0.3988966746488306, + "learning_rate": 9.580630513185431e-06, + "loss": 0.5194, + "step": 1573 + }, + { + "epoch": 1.0915395284327323, + "grad_norm": 0.3885768601621719, + "learning_rate": 9.579659538916839e-06, + "loss": 0.5342, + "step": 1574 + }, + { + "epoch": 1.0922330097087378, + "grad_norm": 0.33852293558354857, + "learning_rate": 9.578687491209219e-06, + "loss": 0.525, + "step": 1575 + }, + { + "epoch": 1.0929264909847434, + "grad_norm": 0.33923252441294693, + "learning_rate": 9.57771437029041e-06, + "loss": 0.4983, + "step": 1576 + }, + { + "epoch": 1.093619972260749, + "grad_norm": 0.3122927651682097, + "learning_rate": 9.576740176388508e-06, + "loss": 0.462, + "step": 1577 + }, + { + "epoch": 1.0943134535367545, + "grad_norm": 0.3529504391622095, + "learning_rate": 9.575764909731853e-06, + "loss": 0.4737, + "step": 1578 + }, + { + "epoch": 1.09500693481276, + "grad_norm": 0.3375825793054644, + "learning_rate": 9.574788570549043e-06, + "loss": 0.5378, + "step": 1579 + }, + { + "epoch": 1.0957004160887656, + "grad_norm": 0.3600900993969908, + "learning_rate": 9.573811159068925e-06, + "loss": 0.5084, + "step": 1580 + }, + { + "epoch": 1.096393897364771, + "grad_norm": 0.3412614032725487, + "learning_rate": 9.572832675520595e-06, + "loss": 0.5036, + "step": 1581 + }, + { + "epoch": 1.0970873786407767, + "grad_norm": 0.3302963055916388, + "learning_rate": 9.571853120133406e-06, + "loss": 0.5398, + "step": 1582 + }, + { + "epoch": 1.0977808599167822, + "grad_norm": 0.4466510484420232, + "learning_rate": 9.570872493136954e-06, + "loss": 0.4834, + "step": 1583 + }, + { + "epoch": 1.0984743411927878, + "grad_norm": 0.47091943476306125, + "learning_rate": 9.569890794761095e-06, + "loss": 0.5083, + "step": 1584 + }, + { + "epoch": 1.0991678224687933, + "grad_norm": 0.33670908274267125, + "learning_rate": 9.56890802523593e-06, + "loss": 0.5115, + "step": 1585 + }, + { + "epoch": 1.0998613037447988, + "grad_norm": 0.3706401346849016, + "learning_rate": 9.567924184791814e-06, + "loss": 0.5599, + "step": 1586 + }, + { + "epoch": 1.1005547850208044, + "grad_norm": 0.35786892708418927, + "learning_rate": 9.56693927365935e-06, + "loss": 0.5445, + "step": 1587 + }, + { + "epoch": 1.10124826629681, + "grad_norm": 0.32991228579665166, + "learning_rate": 9.565953292069397e-06, + "loss": 0.5125, + "step": 1588 + }, + { + "epoch": 1.1019417475728155, + "grad_norm": 0.38052894985225355, + "learning_rate": 9.564966240253062e-06, + "loss": 0.5141, + "step": 1589 + }, + { + "epoch": 1.102635228848821, + "grad_norm": 0.32925034948767845, + "learning_rate": 9.5639781184417e-06, + "loss": 0.4939, + "step": 1590 + }, + { + "epoch": 1.1033287101248266, + "grad_norm": 0.3578660324964388, + "learning_rate": 9.56298892686692e-06, + "loss": 0.5301, + "step": 1591 + }, + { + "epoch": 1.1040221914008321, + "grad_norm": 0.3361576902296305, + "learning_rate": 9.561998665760582e-06, + "loss": 0.507, + "step": 1592 + }, + { + "epoch": 1.1047156726768377, + "grad_norm": 0.338374185320022, + "learning_rate": 9.561007335354797e-06, + "loss": 0.5011, + "step": 1593 + }, + { + "epoch": 1.1054091539528432, + "grad_norm": 0.4512941687336036, + "learning_rate": 9.560014935881924e-06, + "loss": 0.5435, + "step": 1594 + }, + { + "epoch": 1.1061026352288488, + "grad_norm": 0.36437506730679203, + "learning_rate": 9.559021467574576e-06, + "loss": 0.6453, + "step": 1595 + }, + { + "epoch": 1.1067961165048543, + "grad_norm": 0.3880107069803568, + "learning_rate": 9.558026930665614e-06, + "loss": 0.5415, + "step": 1596 + }, + { + "epoch": 1.1074895977808599, + "grad_norm": 0.33285292398220034, + "learning_rate": 9.55703132538815e-06, + "loss": 0.5075, + "step": 1597 + }, + { + "epoch": 1.1081830790568654, + "grad_norm": 0.36076949458117785, + "learning_rate": 9.556034651975548e-06, + "loss": 0.4632, + "step": 1598 + }, + { + "epoch": 1.108876560332871, + "grad_norm": 0.32784402025878107, + "learning_rate": 9.55503691066142e-06, + "loss": 0.4756, + "step": 1599 + }, + { + "epoch": 1.1095700416088765, + "grad_norm": 0.5641648416298276, + "learning_rate": 9.554038101679628e-06, + "loss": 0.5176, + "step": 1600 + }, + { + "epoch": 1.110263522884882, + "grad_norm": 0.3626695197552836, + "learning_rate": 9.553038225264288e-06, + "loss": 0.5065, + "step": 1601 + }, + { + "epoch": 1.1109570041608876, + "grad_norm": 0.34254840267303466, + "learning_rate": 9.552037281649762e-06, + "loss": 0.4998, + "step": 1602 + }, + { + "epoch": 1.1116504854368932, + "grad_norm": 0.3717304989283044, + "learning_rate": 9.551035271070665e-06, + "loss": 0.5393, + "step": 1603 + }, + { + "epoch": 1.1123439667128987, + "grad_norm": 0.34927161083359987, + "learning_rate": 9.55003219376186e-06, + "loss": 0.5168, + "step": 1604 + }, + { + "epoch": 1.1130374479889042, + "grad_norm": 0.34127118205935114, + "learning_rate": 9.549028049958462e-06, + "loss": 0.5584, + "step": 1605 + }, + { + "epoch": 1.1137309292649098, + "grad_norm": 0.4156028666764858, + "learning_rate": 9.548022839895833e-06, + "loss": 0.5404, + "step": 1606 + }, + { + "epoch": 1.1144244105409153, + "grad_norm": 0.340186951918076, + "learning_rate": 9.547016563809591e-06, + "loss": 0.4658, + "step": 1607 + }, + { + "epoch": 1.115117891816921, + "grad_norm": 0.40724070020490183, + "learning_rate": 9.546009221935598e-06, + "loss": 0.5181, + "step": 1608 + }, + { + "epoch": 1.1158113730929264, + "grad_norm": 0.35791742169658786, + "learning_rate": 9.545000814509965e-06, + "loss": 0.5236, + "step": 1609 + }, + { + "epoch": 1.116504854368932, + "grad_norm": 0.3301410764367193, + "learning_rate": 9.543991341769057e-06, + "loss": 0.5279, + "step": 1610 + }, + { + "epoch": 1.1171983356449375, + "grad_norm": 0.3421366405858752, + "learning_rate": 9.542980803949489e-06, + "loss": 0.4561, + "step": 1611 + }, + { + "epoch": 1.117891816920943, + "grad_norm": 0.33058199290264134, + "learning_rate": 9.541969201288123e-06, + "loss": 0.4606, + "step": 1612 + }, + { + "epoch": 1.1185852981969486, + "grad_norm": 0.3514146351328263, + "learning_rate": 9.54095653402207e-06, + "loss": 0.5187, + "step": 1613 + }, + { + "epoch": 1.1192787794729542, + "grad_norm": 0.35195486911332874, + "learning_rate": 9.539942802388693e-06, + "loss": 0.4513, + "step": 1614 + }, + { + "epoch": 1.1199722607489597, + "grad_norm": 0.32031468396585744, + "learning_rate": 9.538928006625603e-06, + "loss": 0.4878, + "step": 1615 + }, + { + "epoch": 1.1206657420249653, + "grad_norm": 0.4437305444298725, + "learning_rate": 9.53791214697066e-06, + "loss": 0.5536, + "step": 1616 + }, + { + "epoch": 1.1213592233009708, + "grad_norm": 0.36914826941196477, + "learning_rate": 9.536895223661975e-06, + "loss": 0.5301, + "step": 1617 + }, + { + "epoch": 1.1220527045769764, + "grad_norm": 0.3650776851106986, + "learning_rate": 9.535877236937907e-06, + "loss": 0.4827, + "step": 1618 + }, + { + "epoch": 1.122746185852982, + "grad_norm": 0.31879212019918035, + "learning_rate": 9.534858187037066e-06, + "loss": 0.4679, + "step": 1619 + }, + { + "epoch": 1.1234396671289875, + "grad_norm": 0.3752832099133058, + "learning_rate": 9.533838074198306e-06, + "loss": 0.5697, + "step": 1620 + }, + { + "epoch": 1.124133148404993, + "grad_norm": 0.35436660672541487, + "learning_rate": 9.532816898660742e-06, + "loss": 0.455, + "step": 1621 + }, + { + "epoch": 1.1248266296809986, + "grad_norm": 0.3281861166023363, + "learning_rate": 9.531794660663723e-06, + "loss": 0.4671, + "step": 1622 + }, + { + "epoch": 1.125520110957004, + "grad_norm": 0.3450769304183243, + "learning_rate": 9.530771360446855e-06, + "loss": 0.4539, + "step": 1623 + }, + { + "epoch": 1.1262135922330097, + "grad_norm": 0.3696520913185721, + "learning_rate": 9.529746998249994e-06, + "loss": 0.5584, + "step": 1624 + }, + { + "epoch": 1.1269070735090152, + "grad_norm": 0.3545638529670006, + "learning_rate": 9.528721574313243e-06, + "loss": 0.4959, + "step": 1625 + }, + { + "epoch": 1.1276005547850207, + "grad_norm": 0.5869636382503858, + "learning_rate": 9.527695088876953e-06, + "loss": 0.5272, + "step": 1626 + }, + { + "epoch": 1.1282940360610263, + "grad_norm": 0.3103827400875764, + "learning_rate": 9.526667542181727e-06, + "loss": 0.4534, + "step": 1627 + }, + { + "epoch": 1.1289875173370318, + "grad_norm": 0.3311933927914896, + "learning_rate": 9.52563893446841e-06, + "loss": 0.435, + "step": 1628 + }, + { + "epoch": 1.1296809986130374, + "grad_norm": 0.3685916125398502, + "learning_rate": 9.524609265978105e-06, + "loss": 0.5774, + "step": 1629 + }, + { + "epoch": 1.130374479889043, + "grad_norm": 0.34351945573431236, + "learning_rate": 9.523578536952155e-06, + "loss": 0.5311, + "step": 1630 + }, + { + "epoch": 1.1310679611650485, + "grad_norm": 0.3230604901871847, + "learning_rate": 9.52254674763216e-06, + "loss": 0.5177, + "step": 1631 + }, + { + "epoch": 1.131761442441054, + "grad_norm": 0.3581666442948397, + "learning_rate": 9.521513898259959e-06, + "loss": 0.5009, + "step": 1632 + }, + { + "epoch": 1.1324549237170596, + "grad_norm": 0.4025996584654056, + "learning_rate": 9.520479989077647e-06, + "loss": 0.547, + "step": 1633 + }, + { + "epoch": 1.1331484049930651, + "grad_norm": 0.34798550205148976, + "learning_rate": 9.519445020327566e-06, + "loss": 0.5082, + "step": 1634 + }, + { + "epoch": 1.1338418862690707, + "grad_norm": 0.35477982279694437, + "learning_rate": 9.518408992252301e-06, + "loss": 0.4967, + "step": 1635 + }, + { + "epoch": 1.1345353675450762, + "grad_norm": 0.3357066407497684, + "learning_rate": 9.517371905094695e-06, + "loss": 0.53, + "step": 1636 + }, + { + "epoch": 1.1352288488210818, + "grad_norm": 0.5012240083436882, + "learning_rate": 9.516333759097828e-06, + "loss": 0.5036, + "step": 1637 + }, + { + "epoch": 1.1359223300970873, + "grad_norm": 0.3411087744256892, + "learning_rate": 9.515294554505039e-06, + "loss": 0.5052, + "step": 1638 + }, + { + "epoch": 1.1366158113730929, + "grad_norm": 0.38406127824500824, + "learning_rate": 9.514254291559905e-06, + "loss": 0.4952, + "step": 1639 + }, + { + "epoch": 1.1373092926490984, + "grad_norm": 0.3529097042180847, + "learning_rate": 9.513212970506261e-06, + "loss": 0.5331, + "step": 1640 + }, + { + "epoch": 1.138002773925104, + "grad_norm": 0.3873077740797989, + "learning_rate": 9.512170591588183e-06, + "loss": 0.5676, + "step": 1641 + }, + { + "epoch": 1.1386962552011095, + "grad_norm": 0.3554460301360124, + "learning_rate": 9.511127155049996e-06, + "loss": 0.4469, + "step": 1642 + }, + { + "epoch": 1.139389736477115, + "grad_norm": 0.3465874039356611, + "learning_rate": 9.510082661136274e-06, + "loss": 0.4969, + "step": 1643 + }, + { + "epoch": 1.1400832177531206, + "grad_norm": 0.3495298863506024, + "learning_rate": 9.509037110091843e-06, + "loss": 0.5312, + "step": 1644 + }, + { + "epoch": 1.1407766990291262, + "grad_norm": 0.39680003106953243, + "learning_rate": 9.507990502161769e-06, + "loss": 0.4881, + "step": 1645 + }, + { + "epoch": 1.1414701803051317, + "grad_norm": 0.3135739349410391, + "learning_rate": 9.506942837591366e-06, + "loss": 0.5057, + "step": 1646 + }, + { + "epoch": 1.1421636615811372, + "grad_norm": 0.33564410298737213, + "learning_rate": 9.505894116626205e-06, + "loss": 0.4728, + "step": 1647 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 0.3249097679538702, + "learning_rate": 9.504844339512096e-06, + "loss": 0.4617, + "step": 1648 + }, + { + "epoch": 1.1435506241331483, + "grad_norm": 0.3559096317268659, + "learning_rate": 9.5037935064951e-06, + "loss": 0.5854, + "step": 1649 + }, + { + "epoch": 1.1442441054091539, + "grad_norm": 0.3414419544457502, + "learning_rate": 9.502741617821524e-06, + "loss": 0.4893, + "step": 1650 + }, + { + "epoch": 1.1449375866851594, + "grad_norm": 0.3430276905742752, + "learning_rate": 9.501688673737924e-06, + "loss": 0.5249, + "step": 1651 + }, + { + "epoch": 1.145631067961165, + "grad_norm": 0.39074907334250514, + "learning_rate": 9.500634674491099e-06, + "loss": 0.483, + "step": 1652 + }, + { + "epoch": 1.1463245492371705, + "grad_norm": 0.3263341157308737, + "learning_rate": 9.499579620328103e-06, + "loss": 0.4886, + "step": 1653 + }, + { + "epoch": 1.147018030513176, + "grad_norm": 0.40195093908502905, + "learning_rate": 9.498523511496231e-06, + "loss": 0.5827, + "step": 1654 + }, + { + "epoch": 1.1477115117891816, + "grad_norm": 0.32395271958038363, + "learning_rate": 9.497466348243028e-06, + "loss": 0.4689, + "step": 1655 + }, + { + "epoch": 1.1484049930651872, + "grad_norm": 0.3605680761959218, + "learning_rate": 9.496408130816286e-06, + "loss": 0.5025, + "step": 1656 + }, + { + "epoch": 1.1490984743411927, + "grad_norm": 0.4199044813112826, + "learning_rate": 9.495348859464042e-06, + "loss": 0.4717, + "step": 1657 + }, + { + "epoch": 1.1497919556171983, + "grad_norm": 0.344715746484264, + "learning_rate": 9.494288534434581e-06, + "loss": 0.4954, + "step": 1658 + }, + { + "epoch": 1.1504854368932038, + "grad_norm": 0.3692774006241896, + "learning_rate": 9.49322715597644e-06, + "loss": 0.5178, + "step": 1659 + }, + { + "epoch": 1.1511789181692094, + "grad_norm": 0.5416229107335282, + "learning_rate": 9.49216472433839e-06, + "loss": 0.486, + "step": 1660 + }, + { + "epoch": 1.151872399445215, + "grad_norm": 0.3312354320124688, + "learning_rate": 9.491101239769466e-06, + "loss": 0.5321, + "step": 1661 + }, + { + "epoch": 1.1525658807212205, + "grad_norm": 0.3311978668364831, + "learning_rate": 9.490036702518937e-06, + "loss": 0.5422, + "step": 1662 + }, + { + "epoch": 1.153259361997226, + "grad_norm": 0.37499695498774954, + "learning_rate": 9.488971112836324e-06, + "loss": 0.5423, + "step": 1663 + }, + { + "epoch": 1.1539528432732316, + "grad_norm": 0.31228912202218406, + "learning_rate": 9.487904470971392e-06, + "loss": 0.504, + "step": 1664 + }, + { + "epoch": 1.154646324549237, + "grad_norm": 0.3276693072440868, + "learning_rate": 9.486836777174154e-06, + "loss": 0.5037, + "step": 1665 + }, + { + "epoch": 1.1553398058252426, + "grad_norm": 0.5790942315327522, + "learning_rate": 9.485768031694872e-06, + "loss": 0.4983, + "step": 1666 + }, + { + "epoch": 1.1560332871012482, + "grad_norm": 0.434311744774958, + "learning_rate": 9.48469823478405e-06, + "loss": 0.5426, + "step": 1667 + }, + { + "epoch": 1.1567267683772537, + "grad_norm": 0.3331090918950169, + "learning_rate": 9.483627386692442e-06, + "loss": 0.4754, + "step": 1668 + }, + { + "epoch": 1.1574202496532593, + "grad_norm": 0.3624095610356531, + "learning_rate": 9.482555487671045e-06, + "loss": 0.4899, + "step": 1669 + }, + { + "epoch": 1.1581137309292648, + "grad_norm": 0.34964383902752577, + "learning_rate": 9.481482537971109e-06, + "loss": 0.5147, + "step": 1670 + }, + { + "epoch": 1.1588072122052704, + "grad_norm": 0.3574242454702927, + "learning_rate": 9.48040853784412e-06, + "loss": 0.5559, + "step": 1671 + }, + { + "epoch": 1.159500693481276, + "grad_norm": 0.34075858644858553, + "learning_rate": 9.47933348754182e-06, + "loss": 0.5179, + "step": 1672 + }, + { + "epoch": 1.1601941747572815, + "grad_norm": 0.36892774082728613, + "learning_rate": 9.478257387316189e-06, + "loss": 0.5387, + "step": 1673 + }, + { + "epoch": 1.160887656033287, + "grad_norm": 0.36299866215146465, + "learning_rate": 9.47718023741946e-06, + "loss": 0.5024, + "step": 1674 + }, + { + "epoch": 1.1615811373092926, + "grad_norm": 0.3401760609141647, + "learning_rate": 9.476102038104112e-06, + "loss": 0.4958, + "step": 1675 + }, + { + "epoch": 1.1622746185852981, + "grad_norm": 0.3821106522236179, + "learning_rate": 9.47502278962286e-06, + "loss": 0.4428, + "step": 1676 + }, + { + "epoch": 1.1629680998613037, + "grad_norm": 0.32656565172819785, + "learning_rate": 9.473942492228676e-06, + "loss": 0.4571, + "step": 1677 + }, + { + "epoch": 1.1636615811373092, + "grad_norm": 0.38063745220229667, + "learning_rate": 9.472861146174777e-06, + "loss": 0.533, + "step": 1678 + }, + { + "epoch": 1.1643550624133148, + "grad_norm": 0.42193005599944505, + "learning_rate": 9.471778751714615e-06, + "loss": 0.5249, + "step": 1679 + }, + { + "epoch": 1.1650485436893203, + "grad_norm": 0.3804509011733188, + "learning_rate": 9.470695309101903e-06, + "loss": 0.5171, + "step": 1680 + }, + { + "epoch": 1.1657420249653259, + "grad_norm": 0.3303163476886524, + "learning_rate": 9.469610818590586e-06, + "loss": 0.4974, + "step": 1681 + }, + { + "epoch": 1.1664355062413314, + "grad_norm": 0.427519321515256, + "learning_rate": 9.468525280434866e-06, + "loss": 0.5826, + "step": 1682 + }, + { + "epoch": 1.167128987517337, + "grad_norm": 0.3345103364148085, + "learning_rate": 9.467438694889181e-06, + "loss": 0.5235, + "step": 1683 + }, + { + "epoch": 1.1678224687933425, + "grad_norm": 0.360892832272255, + "learning_rate": 9.466351062208223e-06, + "loss": 0.5344, + "step": 1684 + }, + { + "epoch": 1.168515950069348, + "grad_norm": 0.3281921420110916, + "learning_rate": 9.465262382646922e-06, + "loss": 0.5117, + "step": 1685 + }, + { + "epoch": 1.1692094313453536, + "grad_norm": 0.3353355636612057, + "learning_rate": 9.464172656460456e-06, + "loss": 0.5201, + "step": 1686 + }, + { + "epoch": 1.1699029126213591, + "grad_norm": 0.3384514348258698, + "learning_rate": 9.463081883904251e-06, + "loss": 0.5106, + "step": 1687 + }, + { + "epoch": 1.1705963938973647, + "grad_norm": 0.31357891074588756, + "learning_rate": 9.461990065233978e-06, + "loss": 0.5352, + "step": 1688 + }, + { + "epoch": 1.1712898751733702, + "grad_norm": 0.3350076853198936, + "learning_rate": 9.460897200705546e-06, + "loss": 0.4853, + "step": 1689 + }, + { + "epoch": 1.1719833564493758, + "grad_norm": 0.4654249770831004, + "learning_rate": 9.459803290575119e-06, + "loss": 0.5408, + "step": 1690 + }, + { + "epoch": 1.1726768377253813, + "grad_norm": 0.33374760497131273, + "learning_rate": 9.458708335099099e-06, + "loss": 0.4823, + "step": 1691 + }, + { + "epoch": 1.1733703190013869, + "grad_norm": 0.33964898430279455, + "learning_rate": 9.457612334534136e-06, + "loss": 0.5307, + "step": 1692 + }, + { + "epoch": 1.1740638002773924, + "grad_norm": 0.34566246311150367, + "learning_rate": 9.456515289137127e-06, + "loss": 0.4628, + "step": 1693 + }, + { + "epoch": 1.174757281553398, + "grad_norm": 0.3586618633106987, + "learning_rate": 9.455417199165209e-06, + "loss": 0.479, + "step": 1694 + }, + { + "epoch": 1.1754507628294035, + "grad_norm": 0.3593559685406414, + "learning_rate": 9.454318064875767e-06, + "loss": 0.474, + "step": 1695 + }, + { + "epoch": 1.176144244105409, + "grad_norm": 0.37699022032545726, + "learning_rate": 9.45321788652643e-06, + "loss": 0.5317, + "step": 1696 + }, + { + "epoch": 1.1768377253814146, + "grad_norm": 0.3583201560438081, + "learning_rate": 9.452116664375072e-06, + "loss": 0.4868, + "step": 1697 + }, + { + "epoch": 1.1775312066574202, + "grad_norm": 0.3683286584518682, + "learning_rate": 9.451014398679814e-06, + "loss": 0.4933, + "step": 1698 + }, + { + "epoch": 1.1782246879334257, + "grad_norm": 0.37985472360003564, + "learning_rate": 9.449911089699015e-06, + "loss": 0.4535, + "step": 1699 + }, + { + "epoch": 1.1789181692094313, + "grad_norm": 0.3534260584391775, + "learning_rate": 9.448806737691285e-06, + "loss": 0.5715, + "step": 1700 + }, + { + "epoch": 1.1796116504854368, + "grad_norm": 0.34369316549359924, + "learning_rate": 9.447701342915473e-06, + "loss": 0.4808, + "step": 1701 + }, + { + "epoch": 1.1803051317614424, + "grad_norm": 0.3341010220537988, + "learning_rate": 9.446594905630682e-06, + "loss": 0.4959, + "step": 1702 + }, + { + "epoch": 1.180998613037448, + "grad_norm": 0.3574491631980831, + "learning_rate": 9.445487426096247e-06, + "loss": 0.4908, + "step": 1703 + }, + { + "epoch": 1.1816920943134535, + "grad_norm": 0.336352118318098, + "learning_rate": 9.444378904571753e-06, + "loss": 0.5053, + "step": 1704 + }, + { + "epoch": 1.182385575589459, + "grad_norm": 0.32712792361743737, + "learning_rate": 9.443269341317034e-06, + "loss": 0.474, + "step": 1705 + }, + { + "epoch": 1.1830790568654646, + "grad_norm": 0.38552992785239587, + "learning_rate": 9.442158736592163e-06, + "loss": 0.4843, + "step": 1706 + }, + { + "epoch": 1.18377253814147, + "grad_norm": 0.37955703265610796, + "learning_rate": 9.441047090657452e-06, + "loss": 0.545, + "step": 1707 + }, + { + "epoch": 1.1844660194174756, + "grad_norm": 0.37868135130282043, + "learning_rate": 9.439934403773468e-06, + "loss": 0.4657, + "step": 1708 + }, + { + "epoch": 1.1851595006934812, + "grad_norm": 0.359046967807064, + "learning_rate": 9.438820676201013e-06, + "loss": 0.4929, + "step": 1709 + }, + { + "epoch": 1.1858529819694867, + "grad_norm": 0.3619491084136992, + "learning_rate": 9.437705908201142e-06, + "loss": 0.4691, + "step": 1710 + }, + { + "epoch": 1.1865464632454923, + "grad_norm": 0.34760598844116564, + "learning_rate": 9.436590100035145e-06, + "loss": 0.5365, + "step": 1711 + }, + { + "epoch": 1.1872399445214978, + "grad_norm": 0.34970273680841074, + "learning_rate": 9.435473251964559e-06, + "loss": 0.5012, + "step": 1712 + }, + { + "epoch": 1.1879334257975034, + "grad_norm": 0.34684043383821284, + "learning_rate": 9.434355364251167e-06, + "loss": 0.5256, + "step": 1713 + }, + { + "epoch": 1.188626907073509, + "grad_norm": 0.3524132031035981, + "learning_rate": 9.43323643715699e-06, + "loss": 0.553, + "step": 1714 + }, + { + "epoch": 1.1893203883495145, + "grad_norm": 0.3307702534918243, + "learning_rate": 9.4321164709443e-06, + "loss": 0.516, + "step": 1715 + }, + { + "epoch": 1.19001386962552, + "grad_norm": 0.36002029927529566, + "learning_rate": 9.43099546587561e-06, + "loss": 0.5012, + "step": 1716 + }, + { + "epoch": 1.1907073509015256, + "grad_norm": 0.331128273417454, + "learning_rate": 9.429873422213673e-06, + "loss": 0.4807, + "step": 1717 + }, + { + "epoch": 1.1914008321775311, + "grad_norm": 0.3899947371381505, + "learning_rate": 9.428750340221488e-06, + "loss": 0.5244, + "step": 1718 + }, + { + "epoch": 1.1920943134535367, + "grad_norm": 0.36816407199272744, + "learning_rate": 9.427626220162298e-06, + "loss": 0.5394, + "step": 1719 + }, + { + "epoch": 1.1927877947295422, + "grad_norm": 0.32023218435108053, + "learning_rate": 9.42650106229959e-06, + "loss": 0.4513, + "step": 1720 + }, + { + "epoch": 1.1934812760055478, + "grad_norm": 0.3773033890248266, + "learning_rate": 9.425374866897088e-06, + "loss": 0.4799, + "step": 1721 + }, + { + "epoch": 1.1941747572815533, + "grad_norm": 0.3288651593328371, + "learning_rate": 9.42424763421877e-06, + "loss": 0.5217, + "step": 1722 + }, + { + "epoch": 1.1948682385575589, + "grad_norm": 0.36004861260648363, + "learning_rate": 9.423119364528848e-06, + "loss": 0.5179, + "step": 1723 + }, + { + "epoch": 1.1955617198335644, + "grad_norm": 0.3488622875642143, + "learning_rate": 9.42199005809178e-06, + "loss": 0.5417, + "step": 1724 + }, + { + "epoch": 1.19625520110957, + "grad_norm": 0.38678204590663706, + "learning_rate": 9.420859715172267e-06, + "loss": 0.5219, + "step": 1725 + }, + { + "epoch": 1.1969486823855755, + "grad_norm": 0.4030026622612167, + "learning_rate": 9.419728336035254e-06, + "loss": 0.5526, + "step": 1726 + }, + { + "epoch": 1.197642163661581, + "grad_norm": 0.3890856940765283, + "learning_rate": 9.41859592094593e-06, + "loss": 0.4808, + "step": 1727 + }, + { + "epoch": 1.1983356449375866, + "grad_norm": 0.3407430238459701, + "learning_rate": 9.417462470169722e-06, + "loss": 0.5091, + "step": 1728 + }, + { + "epoch": 1.1990291262135921, + "grad_norm": 0.33439585047935827, + "learning_rate": 9.416327983972304e-06, + "loss": 0.5866, + "step": 1729 + }, + { + "epoch": 1.1997226074895977, + "grad_norm": 0.3594607665557123, + "learning_rate": 9.415192462619591e-06, + "loss": 0.5438, + "step": 1730 + }, + { + "epoch": 1.2004160887656032, + "grad_norm": 0.34124882578538185, + "learning_rate": 9.414055906377743e-06, + "loss": 0.5278, + "step": 1731 + }, + { + "epoch": 1.2011095700416088, + "grad_norm": 0.31154514796541144, + "learning_rate": 9.412918315513156e-06, + "loss": 0.4986, + "step": 1732 + }, + { + "epoch": 1.2018030513176143, + "grad_norm": 0.5691552778263413, + "learning_rate": 9.411779690292478e-06, + "loss": 0.5235, + "step": 1733 + }, + { + "epoch": 1.2024965325936199, + "grad_norm": 0.33272400114356926, + "learning_rate": 9.41064003098259e-06, + "loss": 0.4806, + "step": 1734 + }, + { + "epoch": 1.2031900138696254, + "grad_norm": 0.31230477933126916, + "learning_rate": 9.409499337850623e-06, + "loss": 0.5402, + "step": 1735 + }, + { + "epoch": 1.203883495145631, + "grad_norm": 0.3839760199541642, + "learning_rate": 9.408357611163945e-06, + "loss": 0.4798, + "step": 1736 + }, + { + "epoch": 1.2045769764216365, + "grad_norm": 0.3644018935729482, + "learning_rate": 9.407214851190172e-06, + "loss": 0.5468, + "step": 1737 + }, + { + "epoch": 1.205270457697642, + "grad_norm": 0.34112571531473196, + "learning_rate": 9.406071058197154e-06, + "loss": 0.506, + "step": 1738 + }, + { + "epoch": 1.2059639389736476, + "grad_norm": 0.32972775762348827, + "learning_rate": 9.404926232452993e-06, + "loss": 0.5189, + "step": 1739 + }, + { + "epoch": 1.2066574202496532, + "grad_norm": 0.3736726803102763, + "learning_rate": 9.403780374226024e-06, + "loss": 0.4819, + "step": 1740 + }, + { + "epoch": 1.2073509015256587, + "grad_norm": 0.34460422716649386, + "learning_rate": 9.402633483784829e-06, + "loss": 0.4662, + "step": 1741 + }, + { + "epoch": 1.2080443828016643, + "grad_norm": 0.35343428807366783, + "learning_rate": 9.40148556139823e-06, + "loss": 0.5337, + "step": 1742 + }, + { + "epoch": 1.2087378640776698, + "grad_norm": 0.3310744447621281, + "learning_rate": 9.400336607335294e-06, + "loss": 0.5169, + "step": 1743 + }, + { + "epoch": 1.2094313453536754, + "grad_norm": 1.0993749520175058, + "learning_rate": 9.399186621865323e-06, + "loss": 0.4987, + "step": 1744 + }, + { + "epoch": 1.210124826629681, + "grad_norm": 0.3787079557675599, + "learning_rate": 9.398035605257871e-06, + "loss": 0.5271, + "step": 1745 + }, + { + "epoch": 1.2108183079056865, + "grad_norm": 0.3283299973065705, + "learning_rate": 9.396883557782726e-06, + "loss": 0.5133, + "step": 1746 + }, + { + "epoch": 1.211511789181692, + "grad_norm": 0.30892901634222936, + "learning_rate": 9.395730479709916e-06, + "loss": 0.5178, + "step": 1747 + }, + { + "epoch": 1.2122052704576975, + "grad_norm": 0.33384867713731503, + "learning_rate": 9.394576371309719e-06, + "loss": 0.4944, + "step": 1748 + }, + { + "epoch": 1.212898751733703, + "grad_norm": 0.3184968769229458, + "learning_rate": 9.393421232852647e-06, + "loss": 0.5243, + "step": 1749 + }, + { + "epoch": 1.2135922330097086, + "grad_norm": 0.32512841113260527, + "learning_rate": 9.392265064609455e-06, + "loss": 0.5185, + "step": 1750 + }, + { + "epoch": 1.2142857142857142, + "grad_norm": 0.3780573473062979, + "learning_rate": 9.391107866851143e-06, + "loss": 0.518, + "step": 1751 + }, + { + "epoch": 1.2149791955617197, + "grad_norm": 0.3764154859820463, + "learning_rate": 9.38994963984895e-06, + "loss": 0.5036, + "step": 1752 + }, + { + "epoch": 1.2156726768377253, + "grad_norm": 0.3459017489085076, + "learning_rate": 9.388790383874354e-06, + "loss": 0.4779, + "step": 1753 + }, + { + "epoch": 1.2163661581137308, + "grad_norm": 0.3733669780271188, + "learning_rate": 9.387630099199078e-06, + "loss": 0.5058, + "step": 1754 + }, + { + "epoch": 1.2170596393897364, + "grad_norm": 0.344433307636627, + "learning_rate": 9.386468786095083e-06, + "loss": 0.521, + "step": 1755 + }, + { + "epoch": 1.217753120665742, + "grad_norm": 0.3474449635046229, + "learning_rate": 9.385306444834573e-06, + "loss": 0.4371, + "step": 1756 + }, + { + "epoch": 1.2184466019417475, + "grad_norm": 0.35089326845114144, + "learning_rate": 9.384143075689992e-06, + "loss": 0.4947, + "step": 1757 + }, + { + "epoch": 1.219140083217753, + "grad_norm": 0.3732377905127223, + "learning_rate": 9.382978678934025e-06, + "loss": 0.5801, + "step": 1758 + }, + { + "epoch": 1.2198335644937588, + "grad_norm": 0.3444938530511972, + "learning_rate": 9.381813254839599e-06, + "loss": 0.543, + "step": 1759 + }, + { + "epoch": 1.2205270457697641, + "grad_norm": 0.3545084942767667, + "learning_rate": 9.38064680367988e-06, + "loss": 0.4803, + "step": 1760 + }, + { + "epoch": 1.2212205270457699, + "grad_norm": 0.3496106779691193, + "learning_rate": 9.379479325728278e-06, + "loss": 0.5073, + "step": 1761 + }, + { + "epoch": 1.2219140083217752, + "grad_norm": 0.3565004982279895, + "learning_rate": 9.378310821258438e-06, + "loss": 0.4367, + "step": 1762 + }, + { + "epoch": 1.222607489597781, + "grad_norm": 0.380005631044401, + "learning_rate": 9.377141290544252e-06, + "loss": 0.5482, + "step": 1763 + }, + { + "epoch": 1.2233009708737863, + "grad_norm": 0.32483113388649515, + "learning_rate": 9.375970733859848e-06, + "loss": 0.4772, + "step": 1764 + }, + { + "epoch": 1.223994452149792, + "grad_norm": 0.34642682526857854, + "learning_rate": 9.374799151479597e-06, + "loss": 0.5607, + "step": 1765 + }, + { + "epoch": 1.2246879334257974, + "grad_norm": 0.3761000577403412, + "learning_rate": 9.373626543678106e-06, + "loss": 0.4968, + "step": 1766 + }, + { + "epoch": 1.2253814147018032, + "grad_norm": 0.38479697912876704, + "learning_rate": 9.37245291073023e-06, + "loss": 0.4979, + "step": 1767 + }, + { + "epoch": 1.2260748959778085, + "grad_norm": 0.384078635865825, + "learning_rate": 9.371278252911061e-06, + "loss": 0.5143, + "step": 1768 + }, + { + "epoch": 1.2267683772538143, + "grad_norm": 0.33940895010505023, + "learning_rate": 9.370102570495925e-06, + "loss": 0.4829, + "step": 1769 + }, + { + "epoch": 1.2274618585298196, + "grad_norm": 0.3754837694351547, + "learning_rate": 9.368925863760396e-06, + "loss": 0.4952, + "step": 1770 + }, + { + "epoch": 1.2281553398058254, + "grad_norm": 0.30986664311154366, + "learning_rate": 9.367748132980286e-06, + "loss": 0.4546, + "step": 1771 + }, + { + "epoch": 1.2288488210818307, + "grad_norm": 0.36817457336350934, + "learning_rate": 9.366569378431647e-06, + "loss": 0.4881, + "step": 1772 + }, + { + "epoch": 1.2295423023578365, + "grad_norm": 0.33131085104720265, + "learning_rate": 9.36538960039077e-06, + "loss": 0.4924, + "step": 1773 + }, + { + "epoch": 1.2302357836338418, + "grad_norm": 0.4382446417776454, + "learning_rate": 9.364208799134187e-06, + "loss": 0.5041, + "step": 1774 + }, + { + "epoch": 1.2309292649098476, + "grad_norm": 0.35821233800222924, + "learning_rate": 9.363026974938667e-06, + "loss": 0.5396, + "step": 1775 + }, + { + "epoch": 1.2316227461858529, + "grad_norm": 0.5342569170911093, + "learning_rate": 9.361844128081224e-06, + "loss": 0.5203, + "step": 1776 + }, + { + "epoch": 1.2323162274618586, + "grad_norm": 0.38002321598260624, + "learning_rate": 9.360660258839105e-06, + "loss": 0.525, + "step": 1777 + }, + { + "epoch": 1.233009708737864, + "grad_norm": 0.7115304619708138, + "learning_rate": 9.359475367489805e-06, + "loss": 0.4508, + "step": 1778 + }, + { + "epoch": 1.2337031900138697, + "grad_norm": 0.39373627900885494, + "learning_rate": 9.35828945431105e-06, + "loss": 0.5039, + "step": 1779 + }, + { + "epoch": 1.234396671289875, + "grad_norm": 0.3542774756046947, + "learning_rate": 9.357102519580814e-06, + "loss": 0.4401, + "step": 1780 + }, + { + "epoch": 1.2350901525658808, + "grad_norm": 0.32112702665284176, + "learning_rate": 9.3559145635773e-06, + "loss": 0.4374, + "step": 1781 + }, + { + "epoch": 1.2357836338418862, + "grad_norm": 0.3585003989544278, + "learning_rate": 9.354725586578961e-06, + "loss": 0.5308, + "step": 1782 + }, + { + "epoch": 1.236477115117892, + "grad_norm": 0.34996791544106826, + "learning_rate": 9.353535588864481e-06, + "loss": 0.534, + "step": 1783 + }, + { + "epoch": 1.2371705963938973, + "grad_norm": 0.35276513540142673, + "learning_rate": 9.35234457071279e-06, + "loss": 0.4946, + "step": 1784 + }, + { + "epoch": 1.237864077669903, + "grad_norm": 0.34501235341728115, + "learning_rate": 9.351152532403054e-06, + "loss": 0.4946, + "step": 1785 + }, + { + "epoch": 1.2385575589459084, + "grad_norm": 0.6636162679288155, + "learning_rate": 9.349959474214677e-06, + "loss": 0.4955, + "step": 1786 + }, + { + "epoch": 1.2392510402219141, + "grad_norm": 0.36482412876196674, + "learning_rate": 9.348765396427301e-06, + "loss": 0.4954, + "step": 1787 + }, + { + "epoch": 1.2399445214979194, + "grad_norm": 0.380783285560688, + "learning_rate": 9.347570299320811e-06, + "loss": 0.5423, + "step": 1788 + }, + { + "epoch": 1.2406380027739252, + "grad_norm": 0.4111420045033692, + "learning_rate": 9.346374183175332e-06, + "loss": 0.5494, + "step": 1789 + }, + { + "epoch": 1.2413314840499305, + "grad_norm": 0.390779385886406, + "learning_rate": 9.34517704827122e-06, + "loss": 0.4949, + "step": 1790 + }, + { + "epoch": 1.2420249653259363, + "grad_norm": 0.4119419521078131, + "learning_rate": 9.34397889488908e-06, + "loss": 0.4751, + "step": 1791 + }, + { + "epoch": 1.2427184466019416, + "grad_norm": 0.36895537173839116, + "learning_rate": 9.342779723309746e-06, + "loss": 0.5126, + "step": 1792 + }, + { + "epoch": 1.2434119278779474, + "grad_norm": 0.3800295326968954, + "learning_rate": 9.341579533814295e-06, + "loss": 0.4806, + "step": 1793 + }, + { + "epoch": 1.2441054091539527, + "grad_norm": 0.38533573980298713, + "learning_rate": 9.340378326684046e-06, + "loss": 0.5453, + "step": 1794 + }, + { + "epoch": 1.2447988904299585, + "grad_norm": 0.32525208351074025, + "learning_rate": 9.339176102200552e-06, + "loss": 0.5033, + "step": 1795 + }, + { + "epoch": 1.2454923717059638, + "grad_norm": 0.35893685303904643, + "learning_rate": 9.337972860645605e-06, + "loss": 0.5081, + "step": 1796 + }, + { + "epoch": 1.2461858529819696, + "grad_norm": 0.3756367763280424, + "learning_rate": 9.336768602301237e-06, + "loss": 0.5728, + "step": 1797 + }, + { + "epoch": 1.246879334257975, + "grad_norm": 0.39848342030640704, + "learning_rate": 9.335563327449717e-06, + "loss": 0.5115, + "step": 1798 + }, + { + "epoch": 1.2475728155339807, + "grad_norm": 0.3332538030512722, + "learning_rate": 9.334357036373552e-06, + "loss": 0.4812, + "step": 1799 + }, + { + "epoch": 1.248266296809986, + "grad_norm": 0.4936305563062856, + "learning_rate": 9.333149729355488e-06, + "loss": 0.5534, + "step": 1800 + }, + { + "epoch": 1.2489597780859918, + "grad_norm": 0.3464077648272386, + "learning_rate": 9.33194140667851e-06, + "loss": 0.4755, + "step": 1801 + }, + { + "epoch": 1.2496532593619971, + "grad_norm": 0.34412226798281487, + "learning_rate": 9.330732068625841e-06, + "loss": 0.472, + "step": 1802 + }, + { + "epoch": 1.2503467406380029, + "grad_norm": 0.40125874252663685, + "learning_rate": 9.32952171548094e-06, + "loss": 0.5316, + "step": 1803 + }, + { + "epoch": 1.2510402219140082, + "grad_norm": 0.3747570794371812, + "learning_rate": 9.328310347527502e-06, + "loss": 0.6068, + "step": 1804 + }, + { + "epoch": 1.251733703190014, + "grad_norm": 0.3519003462584428, + "learning_rate": 9.32709796504947e-06, + "loss": 0.5555, + "step": 1805 + }, + { + "epoch": 1.2524271844660193, + "grad_norm": 0.3616136500041631, + "learning_rate": 9.32588456833101e-06, + "loss": 0.4587, + "step": 1806 + }, + { + "epoch": 1.253120665742025, + "grad_norm": 0.36318051003589935, + "learning_rate": 9.324670157656537e-06, + "loss": 0.4811, + "step": 1807 + }, + { + "epoch": 1.2538141470180304, + "grad_norm": 0.6895559289517076, + "learning_rate": 9.323454733310699e-06, + "loss": 0.4445, + "step": 1808 + }, + { + "epoch": 1.2545076282940362, + "grad_norm": 0.48409487764298625, + "learning_rate": 9.322238295578385e-06, + "loss": 0.5179, + "step": 1809 + }, + { + "epoch": 1.2552011095700415, + "grad_norm": 0.33808201346064753, + "learning_rate": 9.321020844744717e-06, + "loss": 0.4823, + "step": 1810 + }, + { + "epoch": 1.2558945908460473, + "grad_norm": 0.3501876609955232, + "learning_rate": 9.319802381095058e-06, + "loss": 0.487, + "step": 1811 + }, + { + "epoch": 1.2565880721220526, + "grad_norm": 0.39446790540112453, + "learning_rate": 9.318582904915006e-06, + "loss": 0.5297, + "step": 1812 + }, + { + "epoch": 1.2572815533980584, + "grad_norm": 0.3612380146284427, + "learning_rate": 9.317362416490396e-06, + "loss": 0.493, + "step": 1813 + }, + { + "epoch": 1.2579750346740637, + "grad_norm": 0.33438828335321563, + "learning_rate": 9.316140916107305e-06, + "loss": 0.4888, + "step": 1814 + }, + { + "epoch": 1.2586685159500695, + "grad_norm": 0.37737050106701486, + "learning_rate": 9.314918404052043e-06, + "loss": 0.5449, + "step": 1815 + }, + { + "epoch": 1.2593619972260748, + "grad_norm": 0.3148973693528381, + "learning_rate": 9.313694880611157e-06, + "loss": 0.4131, + "step": 1816 + }, + { + "epoch": 1.2600554785020806, + "grad_norm": 0.3470073014150124, + "learning_rate": 9.312470346071432e-06, + "loss": 0.5104, + "step": 1817 + }, + { + "epoch": 1.2607489597780859, + "grad_norm": 0.3371210211191612, + "learning_rate": 9.31124480071989e-06, + "loss": 0.4775, + "step": 1818 + }, + { + "epoch": 1.2614424410540916, + "grad_norm": 0.41493715604100356, + "learning_rate": 9.310018244843789e-06, + "loss": 0.5228, + "step": 1819 + }, + { + "epoch": 1.262135922330097, + "grad_norm": 0.36177471993141525, + "learning_rate": 9.308790678730627e-06, + "loss": 0.5413, + "step": 1820 + }, + { + "epoch": 1.2628294036061027, + "grad_norm": 0.3653922251647071, + "learning_rate": 9.307562102668135e-06, + "loss": 0.4889, + "step": 1821 + }, + { + "epoch": 1.263522884882108, + "grad_norm": 0.36061884106135783, + "learning_rate": 9.306332516944286e-06, + "loss": 0.5054, + "step": 1822 + }, + { + "epoch": 1.2642163661581138, + "grad_norm": 0.33099520054862996, + "learning_rate": 9.30510192184728e-06, + "loss": 0.514, + "step": 1823 + }, + { + "epoch": 1.2649098474341192, + "grad_norm": 0.35230487588580767, + "learning_rate": 9.303870317665562e-06, + "loss": 0.4735, + "step": 1824 + }, + { + "epoch": 1.265603328710125, + "grad_norm": 0.34182709412480217, + "learning_rate": 9.302637704687813e-06, + "loss": 0.5315, + "step": 1825 + }, + { + "epoch": 1.2662968099861303, + "grad_norm": 0.34561666292270243, + "learning_rate": 9.301404083202947e-06, + "loss": 0.5152, + "step": 1826 + }, + { + "epoch": 1.266990291262136, + "grad_norm": 0.34694018822517697, + "learning_rate": 9.300169453500117e-06, + "loss": 0.5084, + "step": 1827 + }, + { + "epoch": 1.2676837725381414, + "grad_norm": 0.3383285721000504, + "learning_rate": 9.29893381586871e-06, + "loss": 0.5365, + "step": 1828 + }, + { + "epoch": 1.2683772538141471, + "grad_norm": 0.32798283121490285, + "learning_rate": 9.29769717059835e-06, + "loss": 0.507, + "step": 1829 + }, + { + "epoch": 1.2690707350901524, + "grad_norm": 0.4242650703398752, + "learning_rate": 9.296459517978898e-06, + "loss": 0.5339, + "step": 1830 + }, + { + "epoch": 1.2697642163661582, + "grad_norm": 0.36550918769480284, + "learning_rate": 9.295220858300454e-06, + "loss": 0.4783, + "step": 1831 + }, + { + "epoch": 1.2704576976421635, + "grad_norm": 0.3393992967884969, + "learning_rate": 9.293981191853345e-06, + "loss": 0.5473, + "step": 1832 + }, + { + "epoch": 1.2711511789181693, + "grad_norm": 0.36297350829327885, + "learning_rate": 9.292740518928145e-06, + "loss": 0.5456, + "step": 1833 + }, + { + "epoch": 1.2718446601941746, + "grad_norm": 0.6438831175708963, + "learning_rate": 9.291498839815658e-06, + "loss": 0.5028, + "step": 1834 + }, + { + "epoch": 1.2725381414701804, + "grad_norm": 0.34934010035959623, + "learning_rate": 9.29025615480692e-06, + "loss": 0.536, + "step": 1835 + }, + { + "epoch": 1.2732316227461857, + "grad_norm": 0.34350905906439205, + "learning_rate": 9.289012464193215e-06, + "loss": 0.5463, + "step": 1836 + }, + { + "epoch": 1.2739251040221915, + "grad_norm": 0.3185676661328429, + "learning_rate": 9.287767768266046e-06, + "loss": 0.4963, + "step": 1837 + }, + { + "epoch": 1.2746185852981968, + "grad_norm": 0.33608499489175475, + "learning_rate": 9.28652206731717e-06, + "loss": 0.5328, + "step": 1838 + }, + { + "epoch": 1.2753120665742026, + "grad_norm": 0.31767013577623654, + "learning_rate": 9.285275361638564e-06, + "loss": 0.4891, + "step": 1839 + }, + { + "epoch": 1.276005547850208, + "grad_norm": 0.3733185568785257, + "learning_rate": 9.284027651522449e-06, + "loss": 0.5227, + "step": 1840 + }, + { + "epoch": 1.2766990291262137, + "grad_norm": 0.363373361322098, + "learning_rate": 9.282778937261279e-06, + "loss": 0.4966, + "step": 1841 + }, + { + "epoch": 1.277392510402219, + "grad_norm": 0.3418047154421495, + "learning_rate": 9.281529219147742e-06, + "loss": 0.5532, + "step": 1842 + }, + { + "epoch": 1.2780859916782248, + "grad_norm": 0.31665406680819047, + "learning_rate": 9.280278497474765e-06, + "loss": 0.4579, + "step": 1843 + }, + { + "epoch": 1.2787794729542301, + "grad_norm": 0.3583116096524031, + "learning_rate": 9.279026772535508e-06, + "loss": 0.4589, + "step": 1844 + }, + { + "epoch": 1.2794729542302359, + "grad_norm": 0.37982949955058526, + "learning_rate": 9.277774044623366e-06, + "loss": 0.5608, + "step": 1845 + }, + { + "epoch": 1.2801664355062412, + "grad_norm": 0.33138853176950483, + "learning_rate": 9.27652031403197e-06, + "loss": 0.5216, + "step": 1846 + }, + { + "epoch": 1.280859916782247, + "grad_norm": 0.33450791406369784, + "learning_rate": 9.275265581055183e-06, + "loss": 0.5004, + "step": 1847 + }, + { + "epoch": 1.2815533980582523, + "grad_norm": 0.3795679929765968, + "learning_rate": 9.274009845987106e-06, + "loss": 0.5585, + "step": 1848 + }, + { + "epoch": 1.282246879334258, + "grad_norm": 0.35512081526603556, + "learning_rate": 9.272753109122077e-06, + "loss": 0.5057, + "step": 1849 + }, + { + "epoch": 1.2829403606102634, + "grad_norm": 0.34003401418927753, + "learning_rate": 9.271495370754661e-06, + "loss": 0.5055, + "step": 1850 + }, + { + "epoch": 1.2836338418862692, + "grad_norm": 0.3528803434944884, + "learning_rate": 9.270236631179667e-06, + "loss": 0.4718, + "step": 1851 + }, + { + "epoch": 1.2843273231622745, + "grad_norm": 0.29720816670752237, + "learning_rate": 9.268976890692136e-06, + "loss": 0.4671, + "step": 1852 + }, + { + "epoch": 1.2850208044382803, + "grad_norm": 0.3499516875454265, + "learning_rate": 9.267716149587336e-06, + "loss": 0.4894, + "step": 1853 + }, + { + "epoch": 1.2857142857142856, + "grad_norm": 0.35706546425277286, + "learning_rate": 9.266454408160779e-06, + "loss": 0.4647, + "step": 1854 + }, + { + "epoch": 1.2864077669902914, + "grad_norm": 0.3317602798535959, + "learning_rate": 9.26519166670821e-06, + "loss": 0.4981, + "step": 1855 + }, + { + "epoch": 1.2871012482662967, + "grad_norm": 0.4062212899320207, + "learning_rate": 9.263927925525602e-06, + "loss": 0.4919, + "step": 1856 + }, + { + "epoch": 1.2877947295423025, + "grad_norm": 0.34891518938031596, + "learning_rate": 9.262663184909168e-06, + "loss": 0.5429, + "step": 1857 + }, + { + "epoch": 1.2884882108183078, + "grad_norm": 0.6255808059722502, + "learning_rate": 9.261397445155358e-06, + "loss": 0.4118, + "step": 1858 + }, + { + "epoch": 1.2891816920943135, + "grad_norm": 0.37102991097352156, + "learning_rate": 9.260130706560847e-06, + "loss": 0.5428, + "step": 1859 + }, + { + "epoch": 1.2898751733703189, + "grad_norm": 0.3218548607114772, + "learning_rate": 9.258862969422554e-06, + "loss": 0.4679, + "step": 1860 + }, + { + "epoch": 1.2905686546463246, + "grad_norm": 0.3513152116288358, + "learning_rate": 9.257594234037624e-06, + "loss": 0.488, + "step": 1861 + }, + { + "epoch": 1.29126213592233, + "grad_norm": 0.3787649439476071, + "learning_rate": 9.256324500703439e-06, + "loss": 0.5203, + "step": 1862 + }, + { + "epoch": 1.2919556171983357, + "grad_norm": 0.3579882013479024, + "learning_rate": 9.255053769717618e-06, + "loss": 0.535, + "step": 1863 + }, + { + "epoch": 1.292649098474341, + "grad_norm": 0.386163256949996, + "learning_rate": 9.253782041378012e-06, + "loss": 0.5253, + "step": 1864 + }, + { + "epoch": 1.2933425797503468, + "grad_norm": 0.34260691499067647, + "learning_rate": 9.252509315982701e-06, + "loss": 0.4582, + "step": 1865 + }, + { + "epoch": 1.2940360610263522, + "grad_norm": 0.3326924122711991, + "learning_rate": 9.251235593830003e-06, + "loss": 0.4941, + "step": 1866 + }, + { + "epoch": 1.294729542302358, + "grad_norm": 0.34313143935430807, + "learning_rate": 9.249960875218474e-06, + "loss": 0.5028, + "step": 1867 + }, + { + "epoch": 1.2954230235783633, + "grad_norm": 0.5000172994508971, + "learning_rate": 9.248685160446892e-06, + "loss": 0.4919, + "step": 1868 + }, + { + "epoch": 1.296116504854369, + "grad_norm": 0.3481836528118851, + "learning_rate": 9.247408449814281e-06, + "loss": 0.6002, + "step": 1869 + }, + { + "epoch": 1.2968099861303743, + "grad_norm": 1.2486713064261739, + "learning_rate": 9.24613074361989e-06, + "loss": 0.4673, + "step": 1870 + }, + { + "epoch": 1.2975034674063801, + "grad_norm": 0.35244066709293037, + "learning_rate": 9.244852042163207e-06, + "loss": 0.5167, + "step": 1871 + }, + { + "epoch": 1.2981969486823854, + "grad_norm": 0.3571215715967322, + "learning_rate": 9.243572345743944e-06, + "loss": 0.5173, + "step": 1872 + }, + { + "epoch": 1.2988904299583912, + "grad_norm": 0.3542359960492466, + "learning_rate": 9.24229165466206e-06, + "loss": 0.5332, + "step": 1873 + }, + { + "epoch": 1.2995839112343965, + "grad_norm": 0.34014165933613844, + "learning_rate": 9.241009969217734e-06, + "loss": 0.5264, + "step": 1874 + }, + { + "epoch": 1.3002773925104023, + "grad_norm": 0.33693680377191965, + "learning_rate": 9.239727289711385e-06, + "loss": 0.4559, + "step": 1875 + }, + { + "epoch": 1.3009708737864076, + "grad_norm": 0.33057940031515953, + "learning_rate": 9.238443616443666e-06, + "loss": 0.4736, + "step": 1876 + }, + { + "epoch": 1.3016643550624134, + "grad_norm": 0.4539264723083045, + "learning_rate": 9.237158949715462e-06, + "loss": 0.565, + "step": 1877 + }, + { + "epoch": 1.3023578363384187, + "grad_norm": 0.35282147506975486, + "learning_rate": 9.235873289827883e-06, + "loss": 0.4716, + "step": 1878 + }, + { + "epoch": 1.3030513176144245, + "grad_norm": 0.42154634305474753, + "learning_rate": 9.234586637082285e-06, + "loss": 0.5168, + "step": 1879 + }, + { + "epoch": 1.3037447988904298, + "grad_norm": 0.3817510127645713, + "learning_rate": 9.233298991780247e-06, + "loss": 0.5179, + "step": 1880 + }, + { + "epoch": 1.3044382801664356, + "grad_norm": 0.32170508439422457, + "learning_rate": 9.232010354223584e-06, + "loss": 0.4551, + "step": 1881 + }, + { + "epoch": 1.305131761442441, + "grad_norm": 0.5401977930323866, + "learning_rate": 9.230720724714345e-06, + "loss": 0.4646, + "step": 1882 + }, + { + "epoch": 1.3058252427184467, + "grad_norm": 0.36683850299211856, + "learning_rate": 9.229430103554808e-06, + "loss": 0.4684, + "step": 1883 + }, + { + "epoch": 1.306518723994452, + "grad_norm": 0.3290350115726689, + "learning_rate": 9.228138491047484e-06, + "loss": 0.493, + "step": 1884 + }, + { + "epoch": 1.3072122052704578, + "grad_norm": 0.32829750076246034, + "learning_rate": 9.226845887495121e-06, + "loss": 0.5041, + "step": 1885 + }, + { + "epoch": 1.307905686546463, + "grad_norm": 0.3395242367031229, + "learning_rate": 9.225552293200694e-06, + "loss": 0.431, + "step": 1886 + }, + { + "epoch": 1.3085991678224689, + "grad_norm": 0.34592279759627054, + "learning_rate": 9.224257708467412e-06, + "loss": 0.4945, + "step": 1887 + }, + { + "epoch": 1.3092926490984742, + "grad_norm": 0.35782151715558513, + "learning_rate": 9.222962133598716e-06, + "loss": 0.4849, + "step": 1888 + }, + { + "epoch": 1.30998613037448, + "grad_norm": 0.3582950740376383, + "learning_rate": 9.22166556889828e-06, + "loss": 0.5128, + "step": 1889 + }, + { + "epoch": 1.3106796116504853, + "grad_norm": 0.32662955303731456, + "learning_rate": 9.22036801467001e-06, + "loss": 0.5063, + "step": 1890 + }, + { + "epoch": 1.311373092926491, + "grad_norm": 0.33167950055132195, + "learning_rate": 9.219069471218045e-06, + "loss": 0.5088, + "step": 1891 + }, + { + "epoch": 1.3120665742024964, + "grad_norm": 0.3428562599490065, + "learning_rate": 9.21776993884675e-06, + "loss": 0.523, + "step": 1892 + }, + { + "epoch": 1.3127600554785022, + "grad_norm": 0.31385836415922064, + "learning_rate": 9.216469417860727e-06, + "loss": 0.5093, + "step": 1893 + }, + { + "epoch": 1.3134535367545077, + "grad_norm": 0.3476910529162486, + "learning_rate": 9.215167908564811e-06, + "loss": 0.528, + "step": 1894 + }, + { + "epoch": 1.3141470180305133, + "grad_norm": 0.3509319421764095, + "learning_rate": 9.213865411264063e-06, + "loss": 0.4749, + "step": 1895 + }, + { + "epoch": 1.3148404993065188, + "grad_norm": 0.4085357152406572, + "learning_rate": 9.212561926263783e-06, + "loss": 0.4822, + "step": 1896 + }, + { + "epoch": 1.3155339805825244, + "grad_norm": 0.31470837074324837, + "learning_rate": 9.211257453869495e-06, + "loss": 0.44, + "step": 1897 + }, + { + "epoch": 1.31622746185853, + "grad_norm": 0.346880710192492, + "learning_rate": 9.209951994386959e-06, + "loss": 0.4946, + "step": 1898 + }, + { + "epoch": 1.3169209431345354, + "grad_norm": 0.33761442780969503, + "learning_rate": 9.208645548122166e-06, + "loss": 0.5012, + "step": 1899 + }, + { + "epoch": 1.317614424410541, + "grad_norm": 0.9123401009768137, + "learning_rate": 9.207338115381337e-06, + "loss": 0.5107, + "step": 1900 + }, + { + "epoch": 1.3183079056865465, + "grad_norm": 0.32941756567469493, + "learning_rate": 9.206029696470924e-06, + "loss": 0.4634, + "step": 1901 + }, + { + "epoch": 1.319001386962552, + "grad_norm": 0.4467262343503186, + "learning_rate": 9.204720291697613e-06, + "loss": 0.5188, + "step": 1902 + }, + { + "epoch": 1.3196948682385576, + "grad_norm": 0.3606064256688591, + "learning_rate": 9.203409901368317e-06, + "loss": 0.5293, + "step": 1903 + }, + { + "epoch": 1.3203883495145632, + "grad_norm": 0.34796190640242625, + "learning_rate": 9.202098525790182e-06, + "loss": 0.4966, + "step": 1904 + }, + { + "epoch": 1.3210818307905687, + "grad_norm": 0.3851546692100887, + "learning_rate": 9.200786165270585e-06, + "loss": 0.5016, + "step": 1905 + }, + { + "epoch": 1.3217753120665743, + "grad_norm": 0.34453151168862584, + "learning_rate": 9.199472820117136e-06, + "loss": 0.5305, + "step": 1906 + }, + { + "epoch": 1.3224687933425798, + "grad_norm": 0.34667882515232046, + "learning_rate": 9.198158490637671e-06, + "loss": 0.5328, + "step": 1907 + }, + { + "epoch": 1.3231622746185854, + "grad_norm": 0.3236231532972804, + "learning_rate": 9.196843177140262e-06, + "loss": 0.5044, + "step": 1908 + }, + { + "epoch": 1.323855755894591, + "grad_norm": 0.345112813517985, + "learning_rate": 9.195526879933206e-06, + "loss": 0.5277, + "step": 1909 + }, + { + "epoch": 1.3245492371705965, + "grad_norm": 0.36270086969994875, + "learning_rate": 9.194209599325035e-06, + "loss": 0.4663, + "step": 1910 + }, + { + "epoch": 1.325242718446602, + "grad_norm": 0.3699520050573922, + "learning_rate": 9.192891335624508e-06, + "loss": 0.4745, + "step": 1911 + }, + { + "epoch": 1.3259361997226076, + "grad_norm": 0.3546368208576948, + "learning_rate": 9.19157208914062e-06, + "loss": 0.4745, + "step": 1912 + }, + { + "epoch": 1.3266296809986131, + "grad_norm": 0.33467924594224446, + "learning_rate": 9.19025186018259e-06, + "loss": 0.5437, + "step": 1913 + }, + { + "epoch": 1.3273231622746187, + "grad_norm": 0.33368651009289546, + "learning_rate": 9.188930649059873e-06, + "loss": 0.4745, + "step": 1914 + }, + { + "epoch": 1.3280166435506242, + "grad_norm": 0.3772498370675096, + "learning_rate": 9.18760845608215e-06, + "loss": 0.5473, + "step": 1915 + }, + { + "epoch": 1.3287101248266298, + "grad_norm": 0.34393568999619517, + "learning_rate": 9.186285281559331e-06, + "loss": 0.4749, + "step": 1916 + }, + { + "epoch": 1.3294036061026353, + "grad_norm": 0.3891592970007088, + "learning_rate": 9.18496112580156e-06, + "loss": 0.5428, + "step": 1917 + }, + { + "epoch": 1.3300970873786409, + "grad_norm": 0.3602504345888441, + "learning_rate": 9.183635989119211e-06, + "loss": 0.5104, + "step": 1918 + }, + { + "epoch": 1.3307905686546464, + "grad_norm": 0.3435705254934971, + "learning_rate": 9.182309871822886e-06, + "loss": 0.4898, + "step": 1919 + }, + { + "epoch": 1.331484049930652, + "grad_norm": 0.34073929125777125, + "learning_rate": 9.180982774223416e-06, + "loss": 0.4581, + "step": 1920 + }, + { + "epoch": 1.3321775312066575, + "grad_norm": 0.32966061300631383, + "learning_rate": 9.179654696631865e-06, + "loss": 0.4959, + "step": 1921 + }, + { + "epoch": 1.332871012482663, + "grad_norm": 0.34066439491923833, + "learning_rate": 9.178325639359522e-06, + "loss": 0.546, + "step": 1922 + }, + { + "epoch": 1.3335644937586686, + "grad_norm": 0.3446644332438958, + "learning_rate": 9.176995602717912e-06, + "loss": 0.4348, + "step": 1923 + }, + { + "epoch": 1.3342579750346741, + "grad_norm": 0.38647871059078065, + "learning_rate": 9.175664587018782e-06, + "loss": 0.5249, + "step": 1924 + }, + { + "epoch": 1.3349514563106797, + "grad_norm": 0.350583398344365, + "learning_rate": 9.174332592574115e-06, + "loss": 0.5134, + "step": 1925 + }, + { + "epoch": 1.3356449375866852, + "grad_norm": 0.3456941501029919, + "learning_rate": 9.172999619696118e-06, + "loss": 0.5048, + "step": 1926 + }, + { + "epoch": 1.3363384188626908, + "grad_norm": 0.358925043740068, + "learning_rate": 9.171665668697236e-06, + "loss": 0.5295, + "step": 1927 + }, + { + "epoch": 1.3370319001386963, + "grad_norm": 0.3301735183670066, + "learning_rate": 9.170330739890133e-06, + "loss": 0.5118, + "step": 1928 + }, + { + "epoch": 1.3377253814147019, + "grad_norm": 0.33064501569266713, + "learning_rate": 9.168994833587707e-06, + "loss": 0.4833, + "step": 1929 + }, + { + "epoch": 1.3384188626907074, + "grad_norm": 0.35167935250762944, + "learning_rate": 9.167657950103085e-06, + "loss": 0.5254, + "step": 1930 + }, + { + "epoch": 1.339112343966713, + "grad_norm": 0.32943616423598954, + "learning_rate": 9.166320089749623e-06, + "loss": 0.4531, + "step": 1931 + }, + { + "epoch": 1.3398058252427185, + "grad_norm": 0.36170518515368594, + "learning_rate": 9.164981252840908e-06, + "loss": 0.5155, + "step": 1932 + }, + { + "epoch": 1.340499306518724, + "grad_norm": 0.37713628473972527, + "learning_rate": 9.16364143969075e-06, + "loss": 0.5764, + "step": 1933 + }, + { + "epoch": 1.3411927877947296, + "grad_norm": 0.3434375663522262, + "learning_rate": 9.162300650613192e-06, + "loss": 0.5295, + "step": 1934 + }, + { + "epoch": 1.3418862690707352, + "grad_norm": 0.4135405738431598, + "learning_rate": 9.160958885922508e-06, + "loss": 0.5176, + "step": 1935 + }, + { + "epoch": 1.3425797503467407, + "grad_norm": 0.36227045032667216, + "learning_rate": 9.159616145933194e-06, + "loss": 0.521, + "step": 1936 + }, + { + "epoch": 1.3432732316227463, + "grad_norm": 0.3130254608989982, + "learning_rate": 9.158272430959982e-06, + "loss": 0.4726, + "step": 1937 + }, + { + "epoch": 1.3439667128987518, + "grad_norm": 0.31929662397535824, + "learning_rate": 9.156927741317829e-06, + "loss": 0.5399, + "step": 1938 + }, + { + "epoch": 1.3446601941747574, + "grad_norm": 0.3377022421522008, + "learning_rate": 9.155582077321918e-06, + "loss": 0.4771, + "step": 1939 + }, + { + "epoch": 1.345353675450763, + "grad_norm": 0.3281963594186775, + "learning_rate": 9.154235439287665e-06, + "loss": 0.4835, + "step": 1940 + }, + { + "epoch": 1.3460471567267684, + "grad_norm": 0.3241665844093833, + "learning_rate": 9.152887827530711e-06, + "loss": 0.4743, + "step": 1941 + }, + { + "epoch": 1.346740638002774, + "grad_norm": 0.39032873971384746, + "learning_rate": 9.151539242366926e-06, + "loss": 0.5394, + "step": 1942 + }, + { + "epoch": 1.3474341192787795, + "grad_norm": 0.3414710217148856, + "learning_rate": 9.150189684112412e-06, + "loss": 0.4927, + "step": 1943 + }, + { + "epoch": 1.348127600554785, + "grad_norm": 0.3755657853423203, + "learning_rate": 9.148839153083492e-06, + "loss": 0.5162, + "step": 1944 + }, + { + "epoch": 1.3488210818307906, + "grad_norm": 0.3665213335101388, + "learning_rate": 9.14748764959672e-06, + "loss": 0.4832, + "step": 1945 + }, + { + "epoch": 1.3495145631067962, + "grad_norm": 0.3353798881557397, + "learning_rate": 9.146135173968881e-06, + "loss": 0.4796, + "step": 1946 + }, + { + "epoch": 1.3502080443828017, + "grad_norm": 0.3715049523272947, + "learning_rate": 9.144781726516987e-06, + "loss": 0.5228, + "step": 1947 + }, + { + "epoch": 1.3509015256588073, + "grad_norm": 0.3681059833547073, + "learning_rate": 9.143427307558273e-06, + "loss": 0.5248, + "step": 1948 + }, + { + "epoch": 1.3515950069348128, + "grad_norm": 0.3943594985391834, + "learning_rate": 9.142071917410205e-06, + "loss": 0.5, + "step": 1949 + }, + { + "epoch": 1.3522884882108184, + "grad_norm": 0.3159122587189733, + "learning_rate": 9.140715556390478e-06, + "loss": 0.4616, + "step": 1950 + }, + { + "epoch": 1.352981969486824, + "grad_norm": 0.36864890400438305, + "learning_rate": 9.139358224817014e-06, + "loss": 0.5548, + "step": 1951 + }, + { + "epoch": 1.3536754507628295, + "grad_norm": 0.4466662649295465, + "learning_rate": 9.13799992300796e-06, + "loss": 0.4906, + "step": 1952 + }, + { + "epoch": 1.354368932038835, + "grad_norm": 0.34155954537718386, + "learning_rate": 9.136640651281694e-06, + "loss": 0.5071, + "step": 1953 + }, + { + "epoch": 1.3550624133148406, + "grad_norm": 0.3410588498849418, + "learning_rate": 9.135280409956819e-06, + "loss": 0.4705, + "step": 1954 + }, + { + "epoch": 1.3557558945908461, + "grad_norm": 0.35503814912701037, + "learning_rate": 9.133919199352163e-06, + "loss": 0.5441, + "step": 1955 + }, + { + "epoch": 1.3564493758668517, + "grad_norm": 0.3535272880958023, + "learning_rate": 9.132557019786788e-06, + "loss": 0.5011, + "step": 1956 + }, + { + "epoch": 1.3571428571428572, + "grad_norm": 0.4086155883737188, + "learning_rate": 9.131193871579975e-06, + "loss": 0.5543, + "step": 1957 + }, + { + "epoch": 1.3578363384188628, + "grad_norm": 0.430477822463212, + "learning_rate": 9.129829755051239e-06, + "loss": 0.4904, + "step": 1958 + }, + { + "epoch": 1.3585298196948683, + "grad_norm": 0.3314910774056916, + "learning_rate": 9.128464670520318e-06, + "loss": 0.4956, + "step": 1959 + }, + { + "epoch": 1.3592233009708738, + "grad_norm": 0.35154731657033544, + "learning_rate": 9.127098618307177e-06, + "loss": 0.5145, + "step": 1960 + }, + { + "epoch": 1.3599167822468794, + "grad_norm": 0.36388159699529904, + "learning_rate": 9.125731598732011e-06, + "loss": 0.5233, + "step": 1961 + }, + { + "epoch": 1.360610263522885, + "grad_norm": 0.36800106644061464, + "learning_rate": 9.124363612115236e-06, + "loss": 0.5569, + "step": 1962 + }, + { + "epoch": 1.3613037447988905, + "grad_norm": 0.34092547978000565, + "learning_rate": 9.122994658777504e-06, + "loss": 0.4814, + "step": 1963 + }, + { + "epoch": 1.361997226074896, + "grad_norm": 0.32932084798226996, + "learning_rate": 9.121624739039682e-06, + "loss": 0.5132, + "step": 1964 + }, + { + "epoch": 1.3626907073509016, + "grad_norm": 0.33372224323166977, + "learning_rate": 9.120253853222872e-06, + "loss": 0.5647, + "step": 1965 + }, + { + "epoch": 1.3633841886269071, + "grad_norm": 0.39150113814683085, + "learning_rate": 9.118882001648398e-06, + "loss": 0.5084, + "step": 1966 + }, + { + "epoch": 1.3640776699029127, + "grad_norm": 0.34411873662988157, + "learning_rate": 9.117509184637814e-06, + "loss": 0.4616, + "step": 1967 + }, + { + "epoch": 1.3647711511789182, + "grad_norm": 0.3288103030226968, + "learning_rate": 9.116135402512897e-06, + "loss": 0.5063, + "step": 1968 + }, + { + "epoch": 1.3654646324549238, + "grad_norm": 0.405824720854079, + "learning_rate": 9.114760655595653e-06, + "loss": 0.4789, + "step": 1969 + }, + { + "epoch": 1.3661581137309293, + "grad_norm": 0.36204030828621414, + "learning_rate": 9.11338494420831e-06, + "loss": 0.5153, + "step": 1970 + }, + { + "epoch": 1.3668515950069349, + "grad_norm": 0.35661344896539776, + "learning_rate": 9.112008268673329e-06, + "loss": 0.5357, + "step": 1971 + }, + { + "epoch": 1.3675450762829404, + "grad_norm": 0.3356112356485565, + "learning_rate": 9.110630629313388e-06, + "loss": 0.4943, + "step": 1972 + }, + { + "epoch": 1.368238557558946, + "grad_norm": 0.37632531231512184, + "learning_rate": 9.1092520264514e-06, + "loss": 0.5494, + "step": 1973 + }, + { + "epoch": 1.3689320388349515, + "grad_norm": 0.3968694416479254, + "learning_rate": 9.107872460410496e-06, + "loss": 0.5375, + "step": 1974 + }, + { + "epoch": 1.369625520110957, + "grad_norm": 0.3210852092298961, + "learning_rate": 9.10649193151404e-06, + "loss": 0.4835, + "step": 1975 + }, + { + "epoch": 1.3703190013869626, + "grad_norm": 0.3609337315216403, + "learning_rate": 9.105110440085613e-06, + "loss": 0.5464, + "step": 1976 + }, + { + "epoch": 1.3710124826629682, + "grad_norm": 0.36357706506404275, + "learning_rate": 9.103727986449034e-06, + "loss": 0.5759, + "step": 1977 + }, + { + "epoch": 1.3717059639389737, + "grad_norm": 0.35258802728545247, + "learning_rate": 9.102344570928333e-06, + "loss": 0.5178, + "step": 1978 + }, + { + "epoch": 1.3723994452149793, + "grad_norm": 0.35930011648447896, + "learning_rate": 9.100960193847773e-06, + "loss": 0.6151, + "step": 1979 + }, + { + "epoch": 1.3730929264909848, + "grad_norm": 0.3487670030211412, + "learning_rate": 9.099574855531846e-06, + "loss": 0.5149, + "step": 1980 + }, + { + "epoch": 1.3737864077669903, + "grad_norm": 0.9869632462397623, + "learning_rate": 9.098188556305262e-06, + "loss": 0.4822, + "step": 1981 + }, + { + "epoch": 1.374479889042996, + "grad_norm": 0.31863533357197305, + "learning_rate": 9.096801296492963e-06, + "loss": 0.4841, + "step": 1982 + }, + { + "epoch": 1.3751733703190014, + "grad_norm": 0.4822400161903467, + "learning_rate": 9.09541307642011e-06, + "loss": 0.5083, + "step": 1983 + }, + { + "epoch": 1.375866851595007, + "grad_norm": 0.37115373959780606, + "learning_rate": 9.094023896412092e-06, + "loss": 0.4785, + "step": 1984 + }, + { + "epoch": 1.3765603328710125, + "grad_norm": 0.32645115093592375, + "learning_rate": 9.092633756794523e-06, + "loss": 0.522, + "step": 1985 + }, + { + "epoch": 1.377253814147018, + "grad_norm": 0.3164366936010976, + "learning_rate": 9.091242657893241e-06, + "loss": 0.4712, + "step": 1986 + }, + { + "epoch": 1.3779472954230236, + "grad_norm": 0.3753907910898338, + "learning_rate": 9.089850600034312e-06, + "loss": 0.499, + "step": 1987 + }, + { + "epoch": 1.3786407766990292, + "grad_norm": 0.3460678887561886, + "learning_rate": 9.088457583544022e-06, + "loss": 0.5596, + "step": 1988 + }, + { + "epoch": 1.3793342579750347, + "grad_norm": 0.33646652029208507, + "learning_rate": 9.087063608748883e-06, + "loss": 0.4688, + "step": 1989 + }, + { + "epoch": 1.3800277392510403, + "grad_norm": 0.4227406159344678, + "learning_rate": 9.085668675975634e-06, + "loss": 0.4758, + "step": 1990 + }, + { + "epoch": 1.3807212205270458, + "grad_norm": 0.3469319880400181, + "learning_rate": 9.084272785551237e-06, + "loss": 0.4735, + "step": 1991 + }, + { + "epoch": 1.3814147018030514, + "grad_norm": 0.35593313436819635, + "learning_rate": 9.08287593780288e-06, + "loss": 0.5332, + "step": 1992 + }, + { + "epoch": 1.382108183079057, + "grad_norm": 0.3536191263984846, + "learning_rate": 9.081478133057972e-06, + "loss": 0.4984, + "step": 1993 + }, + { + "epoch": 1.3828016643550625, + "grad_norm": 0.3413760268111103, + "learning_rate": 9.080079371644151e-06, + "loss": 0.4519, + "step": 1994 + }, + { + "epoch": 1.383495145631068, + "grad_norm": 0.3477983386054444, + "learning_rate": 9.078679653889273e-06, + "loss": 0.505, + "step": 1995 + }, + { + "epoch": 1.3841886269070736, + "grad_norm": 0.3388010557099065, + "learning_rate": 9.077278980121422e-06, + "loss": 0.4826, + "step": 1996 + }, + { + "epoch": 1.384882108183079, + "grad_norm": 0.3615316941958222, + "learning_rate": 9.075877350668909e-06, + "loss": 0.5495, + "step": 1997 + }, + { + "epoch": 1.3855755894590847, + "grad_norm": 0.3420360781245435, + "learning_rate": 9.074474765860264e-06, + "loss": 0.5217, + "step": 1998 + }, + { + "epoch": 1.3862690707350902, + "grad_norm": 0.32317076294353375, + "learning_rate": 9.073071226024242e-06, + "loss": 0.5058, + "step": 1999 + }, + { + "epoch": 1.3869625520110958, + "grad_norm": 0.3851948084166722, + "learning_rate": 9.071666731489824e-06, + "loss": 0.5253, + "step": 2000 + }, + { + "epoch": 1.3876560332871013, + "grad_norm": 0.4180566489080604, + "learning_rate": 9.07026128258621e-06, + "loss": 0.4451, + "step": 2001 + }, + { + "epoch": 1.3883495145631068, + "grad_norm": 0.3559935952517109, + "learning_rate": 9.068854879642833e-06, + "loss": 0.5685, + "step": 2002 + }, + { + "epoch": 1.3890429958391124, + "grad_norm": 0.3286178778917271, + "learning_rate": 9.067447522989337e-06, + "loss": 0.4819, + "step": 2003 + }, + { + "epoch": 1.389736477115118, + "grad_norm": 0.3612689827673842, + "learning_rate": 9.066039212955602e-06, + "loss": 0.5175, + "step": 2004 + }, + { + "epoch": 1.3904299583911235, + "grad_norm": 0.3552749797383618, + "learning_rate": 9.064629949871721e-06, + "loss": 0.4727, + "step": 2005 + }, + { + "epoch": 1.391123439667129, + "grad_norm": 0.36342615888636715, + "learning_rate": 9.063219734068019e-06, + "loss": 0.4836, + "step": 2006 + }, + { + "epoch": 1.3918169209431346, + "grad_norm": 0.36055358488933875, + "learning_rate": 9.061808565875037e-06, + "loss": 0.4703, + "step": 2007 + }, + { + "epoch": 1.3925104022191401, + "grad_norm": 0.34473756432631464, + "learning_rate": 9.060396445623545e-06, + "loss": 0.505, + "step": 2008 + }, + { + "epoch": 1.3932038834951457, + "grad_norm": 0.33222195880954425, + "learning_rate": 9.058983373644532e-06, + "loss": 0.4928, + "step": 2009 + }, + { + "epoch": 1.3938973647711512, + "grad_norm": 0.36215219367300794, + "learning_rate": 9.057569350269214e-06, + "loss": 0.576, + "step": 2010 + }, + { + "epoch": 1.3945908460471568, + "grad_norm": 0.4219491000516108, + "learning_rate": 9.056154375829028e-06, + "loss": 0.5195, + "step": 2011 + }, + { + "epoch": 1.3952843273231623, + "grad_norm": 0.32138636461949377, + "learning_rate": 9.054738450655628e-06, + "loss": 0.5442, + "step": 2012 + }, + { + "epoch": 1.3959778085991679, + "grad_norm": 0.35071835083037484, + "learning_rate": 9.053321575080905e-06, + "loss": 0.4999, + "step": 2013 + }, + { + "epoch": 1.3966712898751734, + "grad_norm": 0.3287371478947435, + "learning_rate": 9.05190374943696e-06, + "loss": 0.4832, + "step": 2014 + }, + { + "epoch": 1.397364771151179, + "grad_norm": 0.3470611829870114, + "learning_rate": 9.05048497405612e-06, + "loss": 0.4977, + "step": 2015 + }, + { + "epoch": 1.3980582524271845, + "grad_norm": 0.32023025568043867, + "learning_rate": 9.049065249270936e-06, + "loss": 0.5207, + "step": 2016 + }, + { + "epoch": 1.39875173370319, + "grad_norm": 0.3395651298847561, + "learning_rate": 9.047644575414184e-06, + "loss": 0.5021, + "step": 2017 + }, + { + "epoch": 1.3994452149791956, + "grad_norm": 0.36578043964741436, + "learning_rate": 9.046222952818857e-06, + "loss": 0.5643, + "step": 2018 + }, + { + "epoch": 1.4001386962552012, + "grad_norm": 0.30457827608908566, + "learning_rate": 9.044800381818175e-06, + "loss": 0.4992, + "step": 2019 + }, + { + "epoch": 1.4008321775312067, + "grad_norm": 0.33976922886508304, + "learning_rate": 9.043376862745576e-06, + "loss": 0.5138, + "step": 2020 + }, + { + "epoch": 1.4015256588072122, + "grad_norm": 0.37214816766432446, + "learning_rate": 9.041952395934726e-06, + "loss": 0.4518, + "step": 2021 + }, + { + "epoch": 1.4022191400832178, + "grad_norm": 0.3845961163640592, + "learning_rate": 9.040526981719506e-06, + "loss": 0.4607, + "step": 2022 + }, + { + "epoch": 1.4029126213592233, + "grad_norm": 0.3676275315409118, + "learning_rate": 9.039100620434025e-06, + "loss": 0.5147, + "step": 2023 + }, + { + "epoch": 1.403606102635229, + "grad_norm": 0.3062267445478142, + "learning_rate": 9.03767331241261e-06, + "loss": 0.4654, + "step": 2024 + }, + { + "epoch": 1.4042995839112344, + "grad_norm": 0.35555930078364206, + "learning_rate": 9.036245057989815e-06, + "loss": 0.458, + "step": 2025 + }, + { + "epoch": 1.40499306518724, + "grad_norm": 0.325210467347809, + "learning_rate": 9.034815857500407e-06, + "loss": 0.5059, + "step": 2026 + }, + { + "epoch": 1.4056865464632455, + "grad_norm": 0.35884101622999165, + "learning_rate": 9.033385711279385e-06, + "loss": 0.5423, + "step": 2027 + }, + { + "epoch": 1.406380027739251, + "grad_norm": 0.35037320776031283, + "learning_rate": 9.031954619661964e-06, + "loss": 0.477, + "step": 2028 + }, + { + "epoch": 1.4070735090152566, + "grad_norm": 0.30423510507148654, + "learning_rate": 9.030522582983582e-06, + "loss": 0.4304, + "step": 2029 + }, + { + "epoch": 1.4077669902912622, + "grad_norm": 0.31991677056009316, + "learning_rate": 9.029089601579895e-06, + "loss": 0.4998, + "step": 2030 + }, + { + "epoch": 1.4084604715672677, + "grad_norm": 0.3470277841469619, + "learning_rate": 9.027655675786785e-06, + "loss": 0.5473, + "step": 2031 + }, + { + "epoch": 1.4091539528432733, + "grad_norm": 0.31084928616723073, + "learning_rate": 9.026220805940357e-06, + "loss": 0.4927, + "step": 2032 + }, + { + "epoch": 1.4098474341192788, + "grad_norm": 0.32211321657284153, + "learning_rate": 9.02478499237693e-06, + "loss": 0.5132, + "step": 2033 + }, + { + "epoch": 1.4105409153952844, + "grad_norm": 0.38960526198219364, + "learning_rate": 9.02334823543305e-06, + "loss": 0.5028, + "step": 2034 + }, + { + "epoch": 1.41123439667129, + "grad_norm": 0.30185956613060205, + "learning_rate": 9.021910535445479e-06, + "loss": 0.437, + "step": 2035 + }, + { + "epoch": 1.4119278779472955, + "grad_norm": 0.5097190134626377, + "learning_rate": 9.02047189275121e-06, + "loss": 0.5201, + "step": 2036 + }, + { + "epoch": 1.412621359223301, + "grad_norm": 0.330140909546037, + "learning_rate": 9.019032307687446e-06, + "loss": 0.4561, + "step": 2037 + }, + { + "epoch": 1.4133148404993066, + "grad_norm": 0.3640389309441148, + "learning_rate": 9.017591780591615e-06, + "loss": 0.5578, + "step": 2038 + }, + { + "epoch": 1.414008321775312, + "grad_norm": 0.3519252082018009, + "learning_rate": 9.016150311801371e-06, + "loss": 0.4984, + "step": 2039 + }, + { + "epoch": 1.4147018030513177, + "grad_norm": 0.3069686446379406, + "learning_rate": 9.014707901654576e-06, + "loss": 0.407, + "step": 2040 + }, + { + "epoch": 1.4153952843273232, + "grad_norm": 0.34196917299982177, + "learning_rate": 9.013264550489327e-06, + "loss": 0.579, + "step": 2041 + }, + { + "epoch": 1.4160887656033287, + "grad_norm": 0.35132469722852405, + "learning_rate": 9.011820258643933e-06, + "loss": 0.5231, + "step": 2042 + }, + { + "epoch": 1.4167822468793343, + "grad_norm": 0.33909092335381796, + "learning_rate": 9.010375026456923e-06, + "loss": 0.4898, + "step": 2043 + }, + { + "epoch": 1.4174757281553398, + "grad_norm": 0.3568708820304695, + "learning_rate": 9.008928854267054e-06, + "loss": 0.4565, + "step": 2044 + }, + { + "epoch": 1.4181692094313454, + "grad_norm": 0.3615298783462584, + "learning_rate": 9.007481742413295e-06, + "loss": 0.5035, + "step": 2045 + }, + { + "epoch": 1.418862690707351, + "grad_norm": 0.32615725363710346, + "learning_rate": 9.006033691234838e-06, + "loss": 0.4762, + "step": 2046 + }, + { + "epoch": 1.4195561719833565, + "grad_norm": 0.3361013448137094, + "learning_rate": 9.004584701071098e-06, + "loss": 0.5041, + "step": 2047 + }, + { + "epoch": 1.420249653259362, + "grad_norm": 0.3318446360127158, + "learning_rate": 9.003134772261705e-06, + "loss": 0.5036, + "step": 2048 + }, + { + "epoch": 1.4209431345353676, + "grad_norm": 0.36039462613717194, + "learning_rate": 9.001683905146516e-06, + "loss": 0.5307, + "step": 2049 + }, + { + "epoch": 1.4216366158113731, + "grad_norm": 0.3726157396910557, + "learning_rate": 9.000232100065599e-06, + "loss": 0.4882, + "step": 2050 + }, + { + "epoch": 1.4223300970873787, + "grad_norm": 0.32280840980165915, + "learning_rate": 8.99877935735925e-06, + "loss": 0.4782, + "step": 2051 + }, + { + "epoch": 1.4230235783633842, + "grad_norm": 0.3725354963452832, + "learning_rate": 8.99732567736798e-06, + "loss": 0.4809, + "step": 2052 + }, + { + "epoch": 1.4237170596393898, + "grad_norm": 0.3634630137709549, + "learning_rate": 8.99587106043252e-06, + "loss": 0.4983, + "step": 2053 + }, + { + "epoch": 1.4244105409153953, + "grad_norm": 0.37109924576279596, + "learning_rate": 8.994415506893824e-06, + "loss": 0.5249, + "step": 2054 + }, + { + "epoch": 1.4251040221914009, + "grad_norm": 0.32474529487445025, + "learning_rate": 8.992959017093062e-06, + "loss": 0.4616, + "step": 2055 + }, + { + "epoch": 1.4257975034674064, + "grad_norm": 0.3323036851505102, + "learning_rate": 8.991501591371625e-06, + "loss": 0.5263, + "step": 2056 + }, + { + "epoch": 1.426490984743412, + "grad_norm": 0.36395942534364123, + "learning_rate": 8.990043230071123e-06, + "loss": 0.5053, + "step": 2057 + }, + { + "epoch": 1.4271844660194175, + "grad_norm": 0.33019473310497166, + "learning_rate": 8.988583933533384e-06, + "loss": 0.5055, + "step": 2058 + }, + { + "epoch": 1.427877947295423, + "grad_norm": 0.35570573889246276, + "learning_rate": 8.987123702100459e-06, + "loss": 0.498, + "step": 2059 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 0.35527667461959495, + "learning_rate": 8.985662536114614e-06, + "loss": 0.5337, + "step": 2060 + }, + { + "epoch": 1.4292649098474342, + "grad_norm": 0.33569270892402076, + "learning_rate": 8.984200435918335e-06, + "loss": 0.4616, + "step": 2061 + }, + { + "epoch": 1.4299583911234397, + "grad_norm": 0.38513224867004137, + "learning_rate": 8.982737401854328e-06, + "loss": 0.5027, + "step": 2062 + }, + { + "epoch": 1.4306518723994452, + "grad_norm": 0.3376653705404227, + "learning_rate": 8.981273434265521e-06, + "loss": 0.532, + "step": 2063 + }, + { + "epoch": 1.4313453536754508, + "grad_norm": 0.33072082850503065, + "learning_rate": 8.979808533495054e-06, + "loss": 0.4822, + "step": 2064 + }, + { + "epoch": 1.4320388349514563, + "grad_norm": 0.3614107118048633, + "learning_rate": 8.978342699886289e-06, + "loss": 0.4828, + "step": 2065 + }, + { + "epoch": 1.4327323162274619, + "grad_norm": 0.3237837634048971, + "learning_rate": 8.976875933782808e-06, + "loss": 0.472, + "step": 2066 + }, + { + "epoch": 1.4334257975034674, + "grad_norm": 0.3836504827557471, + "learning_rate": 8.97540823552841e-06, + "loss": 0.5415, + "step": 2067 + }, + { + "epoch": 1.434119278779473, + "grad_norm": 0.3386868314164909, + "learning_rate": 8.973939605467112e-06, + "loss": 0.4822, + "step": 2068 + }, + { + "epoch": 1.4348127600554785, + "grad_norm": 0.3279036246414576, + "learning_rate": 8.972470043943153e-06, + "loss": 0.4811, + "step": 2069 + }, + { + "epoch": 1.435506241331484, + "grad_norm": 0.357983401347262, + "learning_rate": 8.970999551300985e-06, + "loss": 0.5094, + "step": 2070 + }, + { + "epoch": 1.4361997226074896, + "grad_norm": 0.33008142807332214, + "learning_rate": 8.969528127885281e-06, + "loss": 0.4911, + "step": 2071 + }, + { + "epoch": 1.4368932038834952, + "grad_norm": 0.32539185770423046, + "learning_rate": 8.968055774040932e-06, + "loss": 0.535, + "step": 2072 + }, + { + "epoch": 1.4375866851595007, + "grad_norm": 0.39822619448159824, + "learning_rate": 8.966582490113049e-06, + "loss": 0.4831, + "step": 2073 + }, + { + "epoch": 1.4382801664355063, + "grad_norm": 0.33787571084119516, + "learning_rate": 8.965108276446956e-06, + "loss": 0.4439, + "step": 2074 + }, + { + "epoch": 1.4389736477115118, + "grad_norm": 0.332140043882061, + "learning_rate": 8.963633133388201e-06, + "loss": 0.4741, + "step": 2075 + }, + { + "epoch": 1.4396671289875174, + "grad_norm": 0.3418478839502107, + "learning_rate": 8.962157061282545e-06, + "loss": 0.5358, + "step": 2076 + }, + { + "epoch": 1.440360610263523, + "grad_norm": 0.32405471861893115, + "learning_rate": 8.96068006047597e-06, + "loss": 0.4444, + "step": 2077 + }, + { + "epoch": 1.4410540915395285, + "grad_norm": 0.41591462290526815, + "learning_rate": 8.959202131314672e-06, + "loss": 0.4977, + "step": 2078 + }, + { + "epoch": 1.441747572815534, + "grad_norm": 0.35991599715141853, + "learning_rate": 8.95772327414507e-06, + "loss": 0.5205, + "step": 2079 + }, + { + "epoch": 1.4424410540915396, + "grad_norm": 0.35078008153840906, + "learning_rate": 8.956243489313795e-06, + "loss": 0.4837, + "step": 2080 + }, + { + "epoch": 1.443134535367545, + "grad_norm": 0.36087599320853947, + "learning_rate": 8.954762777167697e-06, + "loss": 0.567, + "step": 2081 + }, + { + "epoch": 1.4438280166435506, + "grad_norm": 0.3305749845675022, + "learning_rate": 8.953281138053847e-06, + "loss": 0.4989, + "step": 2082 + }, + { + "epoch": 1.4445214979195562, + "grad_norm": 0.617152688711202, + "learning_rate": 8.95179857231953e-06, + "loss": 0.5462, + "step": 2083 + }, + { + "epoch": 1.4452149791955617, + "grad_norm": 0.45083498726989446, + "learning_rate": 8.950315080312246e-06, + "loss": 0.5485, + "step": 2084 + }, + { + "epoch": 1.4459084604715673, + "grad_norm": 0.3172409370500703, + "learning_rate": 8.948830662379717e-06, + "loss": 0.4968, + "step": 2085 + }, + { + "epoch": 1.4466019417475728, + "grad_norm": 0.33435860902312314, + "learning_rate": 8.947345318869883e-06, + "loss": 0.5809, + "step": 2086 + }, + { + "epoch": 1.4472954230235784, + "grad_norm": 0.34240526523562725, + "learning_rate": 8.94585905013089e-06, + "loss": 0.4777, + "step": 2087 + }, + { + "epoch": 1.447988904299584, + "grad_norm": 0.34760879718399595, + "learning_rate": 8.944371856511116e-06, + "loss": 0.4491, + "step": 2088 + }, + { + "epoch": 1.4486823855755895, + "grad_norm": 0.3189560363141416, + "learning_rate": 8.942883738359142e-06, + "loss": 0.5456, + "step": 2089 + }, + { + "epoch": 1.449375866851595, + "grad_norm": 0.3170983540269611, + "learning_rate": 8.941394696023779e-06, + "loss": 0.4565, + "step": 2090 + }, + { + "epoch": 1.4500693481276006, + "grad_norm": 0.33882475327031664, + "learning_rate": 8.939904729854042e-06, + "loss": 0.4521, + "step": 2091 + }, + { + "epoch": 1.4507628294036061, + "grad_norm": 0.34335086382401686, + "learning_rate": 8.93841384019917e-06, + "loss": 0.5064, + "step": 2092 + }, + { + "epoch": 1.4514563106796117, + "grad_norm": 0.33692361629012535, + "learning_rate": 8.936922027408618e-06, + "loss": 0.4659, + "step": 2093 + }, + { + "epoch": 1.4521497919556172, + "grad_norm": 0.33470829041958633, + "learning_rate": 8.935429291832056e-06, + "loss": 0.5368, + "step": 2094 + }, + { + "epoch": 1.4528432732316228, + "grad_norm": 0.30851002647805853, + "learning_rate": 8.933935633819369e-06, + "loss": 0.4258, + "step": 2095 + }, + { + "epoch": 1.4535367545076283, + "grad_norm": 0.3594190765928513, + "learning_rate": 8.93244105372066e-06, + "loss": 0.4932, + "step": 2096 + }, + { + "epoch": 1.4542302357836339, + "grad_norm": 0.3693326429186167, + "learning_rate": 8.930945551886249e-06, + "loss": 0.4843, + "step": 2097 + }, + { + "epoch": 1.4549237170596394, + "grad_norm": 0.3797760938509028, + "learning_rate": 8.92944912866667e-06, + "loss": 0.4876, + "step": 2098 + }, + { + "epoch": 1.455617198335645, + "grad_norm": 0.34168379646376157, + "learning_rate": 8.927951784412673e-06, + "loss": 0.4963, + "step": 2099 + }, + { + "epoch": 1.4563106796116505, + "grad_norm": 0.30037460374802394, + "learning_rate": 8.926453519475225e-06, + "loss": 0.403, + "step": 2100 + }, + { + "epoch": 1.457004160887656, + "grad_norm": 0.367983397258792, + "learning_rate": 8.924954334205509e-06, + "loss": 0.5647, + "step": 2101 + }, + { + "epoch": 1.4576976421636616, + "grad_norm": 0.3220638697647383, + "learning_rate": 8.923454228954924e-06, + "loss": 0.4745, + "step": 2102 + }, + { + "epoch": 1.4583911234396671, + "grad_norm": 0.3623342305949139, + "learning_rate": 8.92195320407508e-06, + "loss": 0.52, + "step": 2103 + }, + { + "epoch": 1.4590846047156727, + "grad_norm": 0.41548187399238445, + "learning_rate": 8.920451259917813e-06, + "loss": 0.5143, + "step": 2104 + }, + { + "epoch": 1.4597780859916782, + "grad_norm": 0.3386943994206969, + "learning_rate": 8.918948396835161e-06, + "loss": 0.5242, + "step": 2105 + }, + { + "epoch": 1.4604715672676838, + "grad_norm": 0.335730677770548, + "learning_rate": 8.917444615179386e-06, + "loss": 0.4774, + "step": 2106 + }, + { + "epoch": 1.4611650485436893, + "grad_norm": 0.37515893735896927, + "learning_rate": 8.91593991530297e-06, + "loss": 0.5748, + "step": 2107 + }, + { + "epoch": 1.4618585298196949, + "grad_norm": 0.3623649893228703, + "learning_rate": 8.914434297558594e-06, + "loss": 0.5299, + "step": 2108 + }, + { + "epoch": 1.4625520110957004, + "grad_norm": 0.34828253504299667, + "learning_rate": 8.912927762299169e-06, + "loss": 0.5234, + "step": 2109 + }, + { + "epoch": 1.463245492371706, + "grad_norm": 0.3579084360582346, + "learning_rate": 8.911420309877816e-06, + "loss": 0.4325, + "step": 2110 + }, + { + "epoch": 1.4639389736477115, + "grad_norm": 0.3471295356989196, + "learning_rate": 8.909911940647868e-06, + "loss": 0.4906, + "step": 2111 + }, + { + "epoch": 1.464632454923717, + "grad_norm": 0.33132240937688123, + "learning_rate": 8.90840265496288e-06, + "loss": 0.5101, + "step": 2112 + }, + { + "epoch": 1.4653259361997226, + "grad_norm": 0.8576544345183823, + "learning_rate": 8.906892453176617e-06, + "loss": 0.5016, + "step": 2113 + }, + { + "epoch": 1.4660194174757282, + "grad_norm": 0.6223028504780337, + "learning_rate": 8.905381335643056e-06, + "loss": 0.4905, + "step": 2114 + }, + { + "epoch": 1.4667128987517337, + "grad_norm": 0.34079976651899996, + "learning_rate": 8.903869302716395e-06, + "loss": 0.5331, + "step": 2115 + }, + { + "epoch": 1.4674063800277393, + "grad_norm": 0.35387099693512847, + "learning_rate": 8.902356354751042e-06, + "loss": 0.5002, + "step": 2116 + }, + { + "epoch": 1.4680998613037448, + "grad_norm": 0.33390282585804015, + "learning_rate": 8.900842492101622e-06, + "loss": 0.4827, + "step": 2117 + }, + { + "epoch": 1.4687933425797504, + "grad_norm": 0.3484679866645666, + "learning_rate": 8.899327715122972e-06, + "loss": 0.4649, + "step": 2118 + }, + { + "epoch": 1.469486823855756, + "grad_norm": 0.35427941169862176, + "learning_rate": 8.897812024170147e-06, + "loss": 0.5253, + "step": 2119 + }, + { + "epoch": 1.4701803051317615, + "grad_norm": 0.3814576109457133, + "learning_rate": 8.896295419598412e-06, + "loss": 0.5234, + "step": 2120 + }, + { + "epoch": 1.470873786407767, + "grad_norm": 0.3388632778408937, + "learning_rate": 8.89477790176325e-06, + "loss": 0.4988, + "step": 2121 + }, + { + "epoch": 1.4715672676837726, + "grad_norm": 0.3681405474214004, + "learning_rate": 8.893259471020354e-06, + "loss": 0.5191, + "step": 2122 + }, + { + "epoch": 1.472260748959778, + "grad_norm": 0.32775445125977226, + "learning_rate": 8.891740127725634e-06, + "loss": 0.5023, + "step": 2123 + }, + { + "epoch": 1.4729542302357836, + "grad_norm": 0.3457111298038547, + "learning_rate": 8.890219872235215e-06, + "loss": 0.5326, + "step": 2124 + }, + { + "epoch": 1.4736477115117892, + "grad_norm": 0.32876607089486615, + "learning_rate": 8.888698704905431e-06, + "loss": 0.475, + "step": 2125 + }, + { + "epoch": 1.4743411927877947, + "grad_norm": 0.36121090069085415, + "learning_rate": 8.887176626092836e-06, + "loss": 0.519, + "step": 2126 + }, + { + "epoch": 1.4750346740638003, + "grad_norm": 0.3294304857532917, + "learning_rate": 8.88565363615419e-06, + "loss": 0.5223, + "step": 2127 + }, + { + "epoch": 1.4757281553398058, + "grad_norm": 0.3381813421678541, + "learning_rate": 8.884129735446471e-06, + "loss": 0.5335, + "step": 2128 + }, + { + "epoch": 1.4764216366158114, + "grad_norm": 0.32304364687073783, + "learning_rate": 8.882604924326877e-06, + "loss": 0.4938, + "step": 2129 + }, + { + "epoch": 1.477115117891817, + "grad_norm": 0.3299954486263169, + "learning_rate": 8.881079203152805e-06, + "loss": 0.5033, + "step": 2130 + }, + { + "epoch": 1.4778085991678225, + "grad_norm": 0.3515561596055821, + "learning_rate": 8.879552572281876e-06, + "loss": 0.5462, + "step": 2131 + }, + { + "epoch": 1.478502080443828, + "grad_norm": 0.3576475327911327, + "learning_rate": 8.878025032071922e-06, + "loss": 0.504, + "step": 2132 + }, + { + "epoch": 1.4791955617198336, + "grad_norm": 0.3399071238302747, + "learning_rate": 8.876496582880984e-06, + "loss": 0.4852, + "step": 2133 + }, + { + "epoch": 1.4798890429958391, + "grad_norm": 0.3581013682860097, + "learning_rate": 8.874967225067325e-06, + "loss": 0.498, + "step": 2134 + }, + { + "epoch": 1.4805825242718447, + "grad_norm": 0.35887746174050233, + "learning_rate": 8.873436958989409e-06, + "loss": 0.524, + "step": 2135 + }, + { + "epoch": 1.4812760055478502, + "grad_norm": 0.369093070770657, + "learning_rate": 8.871905785005925e-06, + "loss": 0.562, + "step": 2136 + }, + { + "epoch": 1.4819694868238558, + "grad_norm": 0.3327975410271189, + "learning_rate": 8.870373703475767e-06, + "loss": 0.475, + "step": 2137 + }, + { + "epoch": 1.4826629680998613, + "grad_norm": 0.34585276926972686, + "learning_rate": 8.868840714758043e-06, + "loss": 0.4356, + "step": 2138 + }, + { + "epoch": 1.4833564493758669, + "grad_norm": 0.3431727946927612, + "learning_rate": 8.867306819212074e-06, + "loss": 0.5257, + "step": 2139 + }, + { + "epoch": 1.4840499306518724, + "grad_norm": 0.31659786173406046, + "learning_rate": 8.865772017197395e-06, + "loss": 0.4091, + "step": 2140 + }, + { + "epoch": 1.484743411927878, + "grad_norm": 0.3289203283433317, + "learning_rate": 8.864236309073753e-06, + "loss": 0.446, + "step": 2141 + }, + { + "epoch": 1.4854368932038835, + "grad_norm": 0.34570866516093024, + "learning_rate": 8.862699695201107e-06, + "loss": 0.455, + "step": 2142 + }, + { + "epoch": 1.486130374479889, + "grad_norm": 0.3243325705552734, + "learning_rate": 8.861162175939626e-06, + "loss": 0.4854, + "step": 2143 + }, + { + "epoch": 1.4868238557558946, + "grad_norm": 0.4633649442916241, + "learning_rate": 8.859623751649696e-06, + "loss": 0.4723, + "step": 2144 + }, + { + "epoch": 1.4875173370319001, + "grad_norm": 0.3783059857256445, + "learning_rate": 8.858084422691911e-06, + "loss": 0.55, + "step": 2145 + }, + { + "epoch": 1.4882108183079057, + "grad_norm": 0.32371798362830867, + "learning_rate": 8.856544189427078e-06, + "loss": 0.4608, + "step": 2146 + }, + { + "epoch": 1.4889042995839112, + "grad_norm": 0.35498144049642766, + "learning_rate": 8.855003052216219e-06, + "loss": 0.5067, + "step": 2147 + }, + { + "epoch": 1.4895977808599168, + "grad_norm": 0.3158213403639081, + "learning_rate": 8.853461011420563e-06, + "loss": 0.4768, + "step": 2148 + }, + { + "epoch": 1.4902912621359223, + "grad_norm": 0.38794064014236784, + "learning_rate": 8.851918067401552e-06, + "loss": 0.5308, + "step": 2149 + }, + { + "epoch": 1.4909847434119279, + "grad_norm": 0.39633647521149623, + "learning_rate": 8.850374220520845e-06, + "loss": 0.5074, + "step": 2150 + }, + { + "epoch": 1.4916782246879334, + "grad_norm": 0.3486306393934178, + "learning_rate": 8.848829471140308e-06, + "loss": 0.4879, + "step": 2151 + }, + { + "epoch": 1.492371705963939, + "grad_norm": 0.4664071746463165, + "learning_rate": 8.847283819622015e-06, + "loss": 0.5047, + "step": 2152 + }, + { + "epoch": 1.4930651872399445, + "grad_norm": 0.32723494962213, + "learning_rate": 8.845737266328258e-06, + "loss": 0.49, + "step": 2153 + }, + { + "epoch": 1.49375866851595, + "grad_norm": 0.3251236319193131, + "learning_rate": 8.84418981162154e-06, + "loss": 0.4836, + "step": 2154 + }, + { + "epoch": 1.4944521497919556, + "grad_norm": 0.3583769316924854, + "learning_rate": 8.842641455864568e-06, + "loss": 0.5113, + "step": 2155 + }, + { + "epoch": 1.4951456310679612, + "grad_norm": 0.4026476547769237, + "learning_rate": 8.84109219942027e-06, + "loss": 0.5318, + "step": 2156 + }, + { + "epoch": 1.4958391123439667, + "grad_norm": 0.3867651071790106, + "learning_rate": 8.83954204265178e-06, + "loss": 0.454, + "step": 2157 + }, + { + "epoch": 1.4965325936199723, + "grad_norm": 0.360269646116235, + "learning_rate": 8.837990985922442e-06, + "loss": 0.5314, + "step": 2158 + }, + { + "epoch": 1.4972260748959778, + "grad_norm": 0.342018217831643, + "learning_rate": 8.836439029595811e-06, + "loss": 0.4687, + "step": 2159 + }, + { + "epoch": 1.4979195561719834, + "grad_norm": 0.343738873338028, + "learning_rate": 8.83488617403566e-06, + "loss": 0.482, + "step": 2160 + }, + { + "epoch": 1.498613037447989, + "grad_norm": 0.34365928684064706, + "learning_rate": 8.83333241960596e-06, + "loss": 0.5088, + "step": 2161 + }, + { + "epoch": 1.4993065187239945, + "grad_norm": 0.32521792862418825, + "learning_rate": 8.831777766670904e-06, + "loss": 0.4986, + "step": 2162 + }, + { + "epoch": 1.5, + "grad_norm": 0.3372337645649222, + "learning_rate": 8.83022221559489e-06, + "loss": 0.4881, + "step": 2163 + }, + { + "epoch": 1.5006934812760055, + "grad_norm": 0.3338903724583596, + "learning_rate": 8.82866576674253e-06, + "loss": 0.5232, + "step": 2164 + }, + { + "epoch": 1.501386962552011, + "grad_norm": 0.34114613890599516, + "learning_rate": 8.827108420478643e-06, + "loss": 0.5049, + "step": 2165 + }, + { + "epoch": 1.5020804438280166, + "grad_norm": 0.3574385595787303, + "learning_rate": 8.825550177168258e-06, + "loss": 0.5112, + "step": 2166 + }, + { + "epoch": 1.5027739251040222, + "grad_norm": 0.3591715625156868, + "learning_rate": 8.823991037176618e-06, + "loss": 0.5137, + "step": 2167 + }, + { + "epoch": 1.5034674063800277, + "grad_norm": 0.2982555515327498, + "learning_rate": 8.822431000869173e-06, + "loss": 0.4842, + "step": 2168 + }, + { + "epoch": 1.5041608876560333, + "grad_norm": 0.343208379987108, + "learning_rate": 8.820870068611585e-06, + "loss": 0.5075, + "step": 2169 + }, + { + "epoch": 1.5048543689320388, + "grad_norm": 0.3358606227046012, + "learning_rate": 8.819308240769726e-06, + "loss": 0.5765, + "step": 2170 + }, + { + "epoch": 1.5055478502080444, + "grad_norm": 0.357421882435548, + "learning_rate": 8.817745517709675e-06, + "loss": 0.4963, + "step": 2171 + }, + { + "epoch": 1.50624133148405, + "grad_norm": 0.3151770170586487, + "learning_rate": 8.816181899797725e-06, + "loss": 0.5054, + "step": 2172 + }, + { + "epoch": 1.5069348127600555, + "grad_norm": 0.36371559438674267, + "learning_rate": 8.814617387400373e-06, + "loss": 0.5025, + "step": 2173 + }, + { + "epoch": 1.507628294036061, + "grad_norm": 0.3395056390621181, + "learning_rate": 8.813051980884336e-06, + "loss": 0.4751, + "step": 2174 + }, + { + "epoch": 1.5083217753120666, + "grad_norm": 0.3420029494780606, + "learning_rate": 8.811485680616527e-06, + "loss": 0.4887, + "step": 2175 + }, + { + "epoch": 1.5090152565880721, + "grad_norm": 0.3508653123781212, + "learning_rate": 8.809918486964079e-06, + "loss": 0.5213, + "step": 2176 + }, + { + "epoch": 1.5097087378640777, + "grad_norm": 0.4725204458538169, + "learning_rate": 8.808350400294332e-06, + "loss": 0.5489, + "step": 2177 + }, + { + "epoch": 1.5104022191400832, + "grad_norm": 0.36416513504755693, + "learning_rate": 8.806781420974832e-06, + "loss": 0.515, + "step": 2178 + }, + { + "epoch": 1.5110957004160888, + "grad_norm": 0.343763042212301, + "learning_rate": 8.805211549373335e-06, + "loss": 0.5338, + "step": 2179 + }, + { + "epoch": 1.5117891816920943, + "grad_norm": 0.6018582584105034, + "learning_rate": 8.803640785857811e-06, + "loss": 0.4995, + "step": 2180 + }, + { + "epoch": 1.5124826629680999, + "grad_norm": 0.3578014597280583, + "learning_rate": 8.802069130796436e-06, + "loss": 0.5633, + "step": 2181 + }, + { + "epoch": 1.5131761442441054, + "grad_norm": 0.3319497961863665, + "learning_rate": 8.80049658455759e-06, + "loss": 0.4817, + "step": 2182 + }, + { + "epoch": 1.513869625520111, + "grad_norm": 0.3657315763626466, + "learning_rate": 8.79892314750987e-06, + "loss": 0.4595, + "step": 2183 + }, + { + "epoch": 1.5145631067961165, + "grad_norm": 0.3494130129852575, + "learning_rate": 8.797348820022079e-06, + "loss": 0.5178, + "step": 2184 + }, + { + "epoch": 1.515256588072122, + "grad_norm": 0.31213717070599123, + "learning_rate": 8.795773602463223e-06, + "loss": 0.4673, + "step": 2185 + }, + { + "epoch": 1.5159500693481276, + "grad_norm": 0.3680008443335597, + "learning_rate": 8.794197495202525e-06, + "loss": 0.5085, + "step": 2186 + }, + { + "epoch": 1.5166435506241331, + "grad_norm": 0.3483466342828103, + "learning_rate": 8.792620498609416e-06, + "loss": 0.5171, + "step": 2187 + }, + { + "epoch": 1.5173370319001387, + "grad_norm": 0.3156290430000329, + "learning_rate": 8.791042613053527e-06, + "loss": 0.4657, + "step": 2188 + }, + { + "epoch": 1.5180305131761442, + "grad_norm": 0.3558287728094569, + "learning_rate": 8.789463838904707e-06, + "loss": 0.4955, + "step": 2189 + }, + { + "epoch": 1.5187239944521498, + "grad_norm": 0.3613488978910829, + "learning_rate": 8.787884176533007e-06, + "loss": 0.5479, + "step": 2190 + }, + { + "epoch": 1.5194174757281553, + "grad_norm": 0.3490092046960001, + "learning_rate": 8.78630362630869e-06, + "loss": 0.4182, + "step": 2191 + }, + { + "epoch": 1.5201109570041609, + "grad_norm": 0.332937402859545, + "learning_rate": 8.784722188602224e-06, + "loss": 0.5469, + "step": 2192 + }, + { + "epoch": 1.5208044382801664, + "grad_norm": 0.35683842696023627, + "learning_rate": 8.783139863784287e-06, + "loss": 0.5153, + "step": 2193 + }, + { + "epoch": 1.521497919556172, + "grad_norm": 0.37203079646420295, + "learning_rate": 8.781556652225765e-06, + "loss": 0.5426, + "step": 2194 + }, + { + "epoch": 1.5221914008321775, + "grad_norm": 0.33946884040932535, + "learning_rate": 8.779972554297752e-06, + "loss": 0.5234, + "step": 2195 + }, + { + "epoch": 1.522884882108183, + "grad_norm": 0.40931857078688355, + "learning_rate": 8.778387570371544e-06, + "loss": 0.5656, + "step": 2196 + }, + { + "epoch": 1.5235783633841886, + "grad_norm": 0.3415686478395844, + "learning_rate": 8.776801700818658e-06, + "loss": 0.5285, + "step": 2197 + }, + { + "epoch": 1.5242718446601942, + "grad_norm": 0.3390355908928575, + "learning_rate": 8.775214946010806e-06, + "loss": 0.486, + "step": 2198 + }, + { + "epoch": 1.5249653259361997, + "grad_norm": 0.37005348586484604, + "learning_rate": 8.773627306319912e-06, + "loss": 0.4675, + "step": 2199 + }, + { + "epoch": 1.5256588072122053, + "grad_norm": 0.3403610720395017, + "learning_rate": 8.772038782118106e-06, + "loss": 0.4876, + "step": 2200 + }, + { + "epoch": 1.5263522884882108, + "grad_norm": 0.32314497663843095, + "learning_rate": 8.770449373777729e-06, + "loss": 0.4879, + "step": 2201 + }, + { + "epoch": 1.5270457697642164, + "grad_norm": 0.33696459431857195, + "learning_rate": 8.768859081671323e-06, + "loss": 0.5044, + "step": 2202 + }, + { + "epoch": 1.527739251040222, + "grad_norm": 0.33474549090149996, + "learning_rate": 8.767267906171647e-06, + "loss": 0.4893, + "step": 2203 + }, + { + "epoch": 1.5284327323162274, + "grad_norm": 0.35168303363530057, + "learning_rate": 8.765675847651655e-06, + "loss": 0.4831, + "step": 2204 + }, + { + "epoch": 1.529126213592233, + "grad_norm": 0.33683092114089147, + "learning_rate": 8.764082906484518e-06, + "loss": 0.4908, + "step": 2205 + }, + { + "epoch": 1.5298196948682385, + "grad_norm": 0.3427384026255094, + "learning_rate": 8.76248908304361e-06, + "loss": 0.4932, + "step": 2206 + }, + { + "epoch": 1.530513176144244, + "grad_norm": 0.3365800012479881, + "learning_rate": 8.760894377702508e-06, + "loss": 0.5293, + "step": 2207 + }, + { + "epoch": 1.5312066574202496, + "grad_norm": 0.32274601620790755, + "learning_rate": 8.759298790835002e-06, + "loss": 0.4612, + "step": 2208 + }, + { + "epoch": 1.5319001386962552, + "grad_norm": 0.5669181402136003, + "learning_rate": 8.757702322815086e-06, + "loss": 0.5368, + "step": 2209 + }, + { + "epoch": 1.5325936199722607, + "grad_norm": 0.3433752220386639, + "learning_rate": 8.756104974016959e-06, + "loss": 0.4821, + "step": 2210 + }, + { + "epoch": 1.5332871012482663, + "grad_norm": 0.332519921079417, + "learning_rate": 8.754506744815031e-06, + "loss": 0.4661, + "step": 2211 + }, + { + "epoch": 1.5339805825242718, + "grad_norm": 0.3640908473928857, + "learning_rate": 8.752907635583911e-06, + "loss": 0.5831, + "step": 2212 + }, + { + "epoch": 1.5346740638002774, + "grad_norm": 0.3528664268276014, + "learning_rate": 8.751307646698423e-06, + "loss": 0.5935, + "step": 2213 + }, + { + "epoch": 1.535367545076283, + "grad_norm": 0.3262030255599233, + "learning_rate": 8.74970677853359e-06, + "loss": 0.4768, + "step": 2214 + }, + { + "epoch": 1.5360610263522885, + "grad_norm": 0.3654045691376196, + "learning_rate": 8.748105031464644e-06, + "loss": 0.4644, + "step": 2215 + }, + { + "epoch": 1.536754507628294, + "grad_norm": 0.32899870313543844, + "learning_rate": 8.746502405867025e-06, + "loss": 0.5342, + "step": 2216 + }, + { + "epoch": 1.5374479889042996, + "grad_norm": 0.327544087264539, + "learning_rate": 8.744898902116375e-06, + "loss": 0.4568, + "step": 2217 + }, + { + "epoch": 1.5381414701803051, + "grad_norm": 0.32970059808450525, + "learning_rate": 8.743294520588545e-06, + "loss": 0.4778, + "step": 2218 + }, + { + "epoch": 1.5388349514563107, + "grad_norm": 0.34241561151168054, + "learning_rate": 8.74168926165959e-06, + "loss": 0.5069, + "step": 2219 + }, + { + "epoch": 1.5395284327323162, + "grad_norm": 0.38376640762900927, + "learning_rate": 8.740083125705769e-06, + "loss": 0.5319, + "step": 2220 + }, + { + "epoch": 1.5402219140083218, + "grad_norm": 0.34473123025809793, + "learning_rate": 8.738476113103551e-06, + "loss": 0.4952, + "step": 2221 + }, + { + "epoch": 1.5409153952843273, + "grad_norm": 0.33219211908462165, + "learning_rate": 8.736868224229606e-06, + "loss": 0.5298, + "step": 2222 + }, + { + "epoch": 1.5416088765603329, + "grad_norm": 0.3975438898102467, + "learning_rate": 8.735259459460813e-06, + "loss": 0.5781, + "step": 2223 + }, + { + "epoch": 1.5423023578363384, + "grad_norm": 0.34285171530903735, + "learning_rate": 8.733649819174257e-06, + "loss": 0.5105, + "step": 2224 + }, + { + "epoch": 1.542995839112344, + "grad_norm": 0.3444838753241053, + "learning_rate": 8.732039303747223e-06, + "loss": 0.5485, + "step": 2225 + }, + { + "epoch": 1.5436893203883495, + "grad_norm": 0.4314287733836552, + "learning_rate": 8.730427913557205e-06, + "loss": 0.4907, + "step": 2226 + }, + { + "epoch": 1.544382801664355, + "grad_norm": 0.3625342086114008, + "learning_rate": 8.7288156489819e-06, + "loss": 0.5203, + "step": 2227 + }, + { + "epoch": 1.5450762829403606, + "grad_norm": 0.375773964504486, + "learning_rate": 8.727202510399213e-06, + "loss": 0.4542, + "step": 2228 + }, + { + "epoch": 1.5457697642163661, + "grad_norm": 0.3790830361898822, + "learning_rate": 8.725588498187251e-06, + "loss": 0.5421, + "step": 2229 + }, + { + "epoch": 1.5464632454923717, + "grad_norm": 0.33287718148572004, + "learning_rate": 8.723973612724328e-06, + "loss": 0.524, + "step": 2230 + }, + { + "epoch": 1.5471567267683772, + "grad_norm": 0.3190052764715288, + "learning_rate": 8.722357854388958e-06, + "loss": 0.4854, + "step": 2231 + }, + { + "epoch": 1.5478502080443828, + "grad_norm": 0.32989629940199827, + "learning_rate": 8.720741223559867e-06, + "loss": 0.5089, + "step": 2232 + }, + { + "epoch": 1.5485436893203883, + "grad_norm": 0.3553590645382289, + "learning_rate": 8.71912372061598e-06, + "loss": 0.4597, + "step": 2233 + }, + { + "epoch": 1.5492371705963939, + "grad_norm": 0.3444501626690935, + "learning_rate": 8.71750534593643e-06, + "loss": 0.5094, + "step": 2234 + }, + { + "epoch": 1.5499306518723994, + "grad_norm": 0.362346575247488, + "learning_rate": 8.715886099900547e-06, + "loss": 0.4871, + "step": 2235 + }, + { + "epoch": 1.550624133148405, + "grad_norm": 0.32275597228122355, + "learning_rate": 8.714265982887875e-06, + "loss": 0.4799, + "step": 2236 + }, + { + "epoch": 1.5513176144244105, + "grad_norm": 0.3321687215820013, + "learning_rate": 8.712644995278157e-06, + "loss": 0.489, + "step": 2237 + }, + { + "epoch": 1.552011095700416, + "grad_norm": 0.3230531682078885, + "learning_rate": 8.711023137451343e-06, + "loss": 0.4948, + "step": 2238 + }, + { + "epoch": 1.5527045769764216, + "grad_norm": 0.33731908056730714, + "learning_rate": 8.709400409787579e-06, + "loss": 0.551, + "step": 2239 + }, + { + "epoch": 1.5533980582524272, + "grad_norm": 0.35789621588271175, + "learning_rate": 8.707776812667224e-06, + "loss": 0.5029, + "step": 2240 + }, + { + "epoch": 1.5540915395284327, + "grad_norm": 0.34244349742755764, + "learning_rate": 8.706152346470836e-06, + "loss": 0.4838, + "step": 2241 + }, + { + "epoch": 1.5547850208044383, + "grad_norm": 0.3735191834740914, + "learning_rate": 8.704527011579181e-06, + "loss": 0.5116, + "step": 2242 + }, + { + "epoch": 1.5554785020804438, + "grad_norm": 0.3371303277672469, + "learning_rate": 8.702900808373223e-06, + "loss": 0.5015, + "step": 2243 + }, + { + "epoch": 1.5561719833564494, + "grad_norm": 0.3790896432244701, + "learning_rate": 8.701273737234133e-06, + "loss": 0.4674, + "step": 2244 + }, + { + "epoch": 1.556865464632455, + "grad_norm": 0.33844643340669506, + "learning_rate": 8.699645798543286e-06, + "loss": 0.5107, + "step": 2245 + }, + { + "epoch": 1.5575589459084604, + "grad_norm": 0.34770428537631803, + "learning_rate": 8.698016992682257e-06, + "loss": 0.4917, + "step": 2246 + }, + { + "epoch": 1.558252427184466, + "grad_norm": 0.3245168369391238, + "learning_rate": 8.696387320032827e-06, + "loss": 0.4983, + "step": 2247 + }, + { + "epoch": 1.5589459084604715, + "grad_norm": 0.3332521517448146, + "learning_rate": 8.694756780976981e-06, + "loss": 0.5166, + "step": 2248 + }, + { + "epoch": 1.559639389736477, + "grad_norm": 0.4033705446210668, + "learning_rate": 8.693125375896903e-06, + "loss": 0.528, + "step": 2249 + }, + { + "epoch": 1.5603328710124826, + "grad_norm": 0.30375975024798835, + "learning_rate": 8.691493105174984e-06, + "loss": 0.4417, + "step": 2250 + }, + { + "epoch": 1.5610263522884882, + "grad_norm": 0.32536134105180453, + "learning_rate": 8.689859969193817e-06, + "loss": 0.4784, + "step": 2251 + }, + { + "epoch": 1.5617198335644937, + "grad_norm": 0.3569204969265695, + "learning_rate": 8.688225968336196e-06, + "loss": 0.4372, + "step": 2252 + }, + { + "epoch": 1.5624133148404993, + "grad_norm": 0.3345953434225004, + "learning_rate": 8.686591102985118e-06, + "loss": 0.412, + "step": 2253 + }, + { + "epoch": 1.5631067961165048, + "grad_norm": 0.3449520006433837, + "learning_rate": 8.684955373523787e-06, + "loss": 0.4837, + "step": 2254 + }, + { + "epoch": 1.5638002773925104, + "grad_norm": 0.3373259756591131, + "learning_rate": 8.683318780335604e-06, + "loss": 0.4888, + "step": 2255 + }, + { + "epoch": 1.564493758668516, + "grad_norm": 0.4129542149778871, + "learning_rate": 8.681681323804173e-06, + "loss": 0.4928, + "step": 2256 + }, + { + "epoch": 1.5651872399445215, + "grad_norm": 0.34042187219101755, + "learning_rate": 8.680043004313306e-06, + "loss": 0.502, + "step": 2257 + }, + { + "epoch": 1.565880721220527, + "grad_norm": 0.3631752829877917, + "learning_rate": 8.67840382224701e-06, + "loss": 0.5212, + "step": 2258 + }, + { + "epoch": 1.5665742024965326, + "grad_norm": 0.35915220713464724, + "learning_rate": 8.676763777989496e-06, + "loss": 0.4883, + "step": 2259 + }, + { + "epoch": 1.5672676837725381, + "grad_norm": 0.3395023662489134, + "learning_rate": 8.675122871925183e-06, + "loss": 0.4934, + "step": 2260 + }, + { + "epoch": 1.5679611650485437, + "grad_norm": 0.33741239069706436, + "learning_rate": 8.673481104438685e-06, + "loss": 0.5191, + "step": 2261 + }, + { + "epoch": 1.5686546463245492, + "grad_norm": 0.3194854552562877, + "learning_rate": 8.671838475914822e-06, + "loss": 0.4698, + "step": 2262 + }, + { + "epoch": 1.5693481276005548, + "grad_norm": 0.32188779556439207, + "learning_rate": 8.670194986738612e-06, + "loss": 0.4829, + "step": 2263 + }, + { + "epoch": 1.5700416088765603, + "grad_norm": 0.3721520248916597, + "learning_rate": 8.668550637295277e-06, + "loss": 0.522, + "step": 2264 + }, + { + "epoch": 1.5707350901525658, + "grad_norm": 0.35028826293968157, + "learning_rate": 8.666905427970243e-06, + "loss": 0.4901, + "step": 2265 + }, + { + "epoch": 1.5714285714285714, + "grad_norm": 0.3639130105475345, + "learning_rate": 8.665259359149132e-06, + "loss": 0.5087, + "step": 2266 + }, + { + "epoch": 1.572122052704577, + "grad_norm": 0.5190313296340657, + "learning_rate": 8.663612431217774e-06, + "loss": 0.5392, + "step": 2267 + }, + { + "epoch": 1.5728155339805825, + "grad_norm": 0.38849983750330797, + "learning_rate": 8.661964644562194e-06, + "loss": 0.5082, + "step": 2268 + }, + { + "epoch": 1.573509015256588, + "grad_norm": 0.3531878311721917, + "learning_rate": 8.660315999568623e-06, + "loss": 0.4826, + "step": 2269 + }, + { + "epoch": 1.5742024965325936, + "grad_norm": 0.3615359411346645, + "learning_rate": 8.658666496623492e-06, + "loss": 0.5247, + "step": 2270 + }, + { + "epoch": 1.5748959778085991, + "grad_norm": 0.3569792692889927, + "learning_rate": 8.65701613611343e-06, + "loss": 0.5474, + "step": 2271 + }, + { + "epoch": 1.5755894590846047, + "grad_norm": 0.33357778315429043, + "learning_rate": 8.65536491842527e-06, + "loss": 0.5327, + "step": 2272 + }, + { + "epoch": 1.5762829403606102, + "grad_norm": 0.3515711698917928, + "learning_rate": 8.653712843946048e-06, + "loss": 0.5045, + "step": 2273 + }, + { + "epoch": 1.5769764216366158, + "grad_norm": 0.3733973745796123, + "learning_rate": 8.652059913062998e-06, + "loss": 0.4955, + "step": 2274 + }, + { + "epoch": 1.5776699029126213, + "grad_norm": 0.5170113877554505, + "learning_rate": 8.650406126163553e-06, + "loss": 0.4766, + "step": 2275 + }, + { + "epoch": 1.5783633841886269, + "grad_norm": 0.3808574808748637, + "learning_rate": 8.648751483635349e-06, + "loss": 0.4612, + "step": 2276 + }, + { + "epoch": 1.5790568654646324, + "grad_norm": 0.33085709681247133, + "learning_rate": 8.647095985866222e-06, + "loss": 0.5142, + "step": 2277 + }, + { + "epoch": 1.579750346740638, + "grad_norm": 0.32359313611043633, + "learning_rate": 8.64543963324421e-06, + "loss": 0.4902, + "step": 2278 + }, + { + "epoch": 1.5804438280166435, + "grad_norm": 0.3420322606933061, + "learning_rate": 8.64378242615755e-06, + "loss": 0.4427, + "step": 2279 + }, + { + "epoch": 1.581137309292649, + "grad_norm": 0.34829402470440507, + "learning_rate": 8.642124364994678e-06, + "loss": 0.5111, + "step": 2280 + }, + { + "epoch": 1.5818307905686546, + "grad_norm": 0.37885662645245954, + "learning_rate": 8.640465450144232e-06, + "loss": 0.5464, + "step": 2281 + }, + { + "epoch": 1.5825242718446602, + "grad_norm": 0.35659453895805726, + "learning_rate": 8.638805681995052e-06, + "loss": 0.515, + "step": 2282 + }, + { + "epoch": 1.5832177531206657, + "grad_norm": 0.35062405710868483, + "learning_rate": 8.637145060936172e-06, + "loss": 0.5005, + "step": 2283 + }, + { + "epoch": 1.5839112343966713, + "grad_norm": 0.3742523026598048, + "learning_rate": 8.635483587356833e-06, + "loss": 0.5186, + "step": 2284 + }, + { + "epoch": 1.5846047156726768, + "grad_norm": 0.32541801716115343, + "learning_rate": 8.63382126164647e-06, + "loss": 0.5098, + "step": 2285 + }, + { + "epoch": 1.5852981969486823, + "grad_norm": 0.3464661377434391, + "learning_rate": 8.632158084194718e-06, + "loss": 0.4817, + "step": 2286 + }, + { + "epoch": 1.585991678224688, + "grad_norm": 0.33903649571410066, + "learning_rate": 8.630494055391418e-06, + "loss": 0.4999, + "step": 2287 + }, + { + "epoch": 1.5866851595006934, + "grad_norm": 0.35932718031968586, + "learning_rate": 8.628829175626605e-06, + "loss": 0.4996, + "step": 2288 + }, + { + "epoch": 1.587378640776699, + "grad_norm": 0.35072726408323124, + "learning_rate": 8.627163445290514e-06, + "loss": 0.4932, + "step": 2289 + }, + { + "epoch": 1.5880721220527045, + "grad_norm": 0.33516518196198114, + "learning_rate": 8.625496864773581e-06, + "loss": 0.4947, + "step": 2290 + }, + { + "epoch": 1.58876560332871, + "grad_norm": 0.3206029389144817, + "learning_rate": 8.62382943446644e-06, + "loss": 0.4845, + "step": 2291 + }, + { + "epoch": 1.5894590846047156, + "grad_norm": 0.3519214815637636, + "learning_rate": 8.622161154759925e-06, + "loss": 0.5381, + "step": 2292 + }, + { + "epoch": 1.5901525658807212, + "grad_norm": 0.3407678357986744, + "learning_rate": 8.620492026045067e-06, + "loss": 0.4865, + "step": 2293 + }, + { + "epoch": 1.5908460471567267, + "grad_norm": 0.34776026433527907, + "learning_rate": 8.6188220487131e-06, + "loss": 0.4531, + "step": 2294 + }, + { + "epoch": 1.5915395284327323, + "grad_norm": 0.4015731021920983, + "learning_rate": 8.617151223155453e-06, + "loss": 0.5543, + "step": 2295 + }, + { + "epoch": 1.5922330097087378, + "grad_norm": 0.3122812610509576, + "learning_rate": 8.615479549763756e-06, + "loss": 0.5013, + "step": 2296 + }, + { + "epoch": 1.5929264909847434, + "grad_norm": 0.6181767435580547, + "learning_rate": 8.613807028929837e-06, + "loss": 0.4736, + "step": 2297 + }, + { + "epoch": 1.593619972260749, + "grad_norm": 0.35460464085136595, + "learning_rate": 8.612133661045724e-06, + "loss": 0.5251, + "step": 2298 + }, + { + "epoch": 1.5943134535367545, + "grad_norm": 0.34417225984133837, + "learning_rate": 8.610459446503641e-06, + "loss": 0.5461, + "step": 2299 + }, + { + "epoch": 1.59500693481276, + "grad_norm": 0.3376208103562316, + "learning_rate": 8.60878438569601e-06, + "loss": 0.5495, + "step": 2300 + }, + { + "epoch": 1.5957004160887656, + "grad_norm": 0.31049008765757147, + "learning_rate": 8.607108479015456e-06, + "loss": 0.4468, + "step": 2301 + }, + { + "epoch": 1.596393897364771, + "grad_norm": 0.32866297624409146, + "learning_rate": 8.605431726854798e-06, + "loss": 0.4943, + "step": 2302 + }, + { + "epoch": 1.5970873786407767, + "grad_norm": 0.4131569653907188, + "learning_rate": 8.603754129607055e-06, + "loss": 0.5033, + "step": 2303 + }, + { + "epoch": 1.5977808599167822, + "grad_norm": 0.3939326141839437, + "learning_rate": 8.602075687665445e-06, + "loss": 0.5686, + "step": 2304 + }, + { + "epoch": 1.5984743411927878, + "grad_norm": 0.3686498367717291, + "learning_rate": 8.600396401423382e-06, + "loss": 0.5072, + "step": 2305 + }, + { + "epoch": 1.5991678224687933, + "grad_norm": 0.32625710778831524, + "learning_rate": 8.598716271274475e-06, + "loss": 0.4786, + "step": 2306 + }, + { + "epoch": 1.5998613037447988, + "grad_norm": 0.3222713337884579, + "learning_rate": 8.597035297612537e-06, + "loss": 0.4692, + "step": 2307 + }, + { + "epoch": 1.6005547850208044, + "grad_norm": 0.31254029594108695, + "learning_rate": 8.595353480831579e-06, + "loss": 0.468, + "step": 2308 + }, + { + "epoch": 1.60124826629681, + "grad_norm": 0.3706521656367522, + "learning_rate": 8.5936708213258e-06, + "loss": 0.5644, + "step": 2309 + }, + { + "epoch": 1.6019417475728155, + "grad_norm": 0.49448762398620105, + "learning_rate": 8.591987319489612e-06, + "loss": 0.5405, + "step": 2310 + }, + { + "epoch": 1.602635228848821, + "grad_norm": 0.30149327734099346, + "learning_rate": 8.590302975717608e-06, + "loss": 0.4343, + "step": 2311 + }, + { + "epoch": 1.6033287101248266, + "grad_norm": 0.3403371891151002, + "learning_rate": 8.58861779040459e-06, + "loss": 0.5056, + "step": 2312 + }, + { + "epoch": 1.6040221914008321, + "grad_norm": 0.3133703130401212, + "learning_rate": 8.58693176394555e-06, + "loss": 0.5165, + "step": 2313 + }, + { + "epoch": 1.6047156726768377, + "grad_norm": 0.4080524394945862, + "learning_rate": 8.585244896735683e-06, + "loss": 0.4862, + "step": 2314 + }, + { + "epoch": 1.6054091539528432, + "grad_norm": 0.32544398241823613, + "learning_rate": 8.583557189170378e-06, + "loss": 0.4556, + "step": 2315 + }, + { + "epoch": 1.6061026352288488, + "grad_norm": 0.4049591006774152, + "learning_rate": 8.58186864164522e-06, + "loss": 0.5284, + "step": 2316 + }, + { + "epoch": 1.6067961165048543, + "grad_norm": 0.32844390352811, + "learning_rate": 8.580179254555997e-06, + "loss": 0.471, + "step": 2317 + }, + { + "epoch": 1.6074895977808599, + "grad_norm": 0.3339450897007203, + "learning_rate": 8.578489028298682e-06, + "loss": 0.4949, + "step": 2318 + }, + { + "epoch": 1.6081830790568654, + "grad_norm": 0.316657542337206, + "learning_rate": 8.576797963269457e-06, + "loss": 0.4694, + "step": 2319 + }, + { + "epoch": 1.608876560332871, + "grad_norm": 0.34848180397483325, + "learning_rate": 8.575106059864692e-06, + "loss": 0.5174, + "step": 2320 + }, + { + "epoch": 1.6095700416088765, + "grad_norm": 0.34531692714722445, + "learning_rate": 8.573413318480962e-06, + "loss": 0.4592, + "step": 2321 + }, + { + "epoch": 1.610263522884882, + "grad_norm": 0.3297192718940854, + "learning_rate": 8.571719739515027e-06, + "loss": 0.517, + "step": 2322 + }, + { + "epoch": 1.6109570041608876, + "grad_norm": 0.5006368300085653, + "learning_rate": 8.570025323363853e-06, + "loss": 0.5746, + "step": 2323 + }, + { + "epoch": 1.6116504854368932, + "grad_norm": 0.3338139895127677, + "learning_rate": 8.5683300704246e-06, + "loss": 0.4648, + "step": 2324 + }, + { + "epoch": 1.6123439667128987, + "grad_norm": 0.3102976806507002, + "learning_rate": 8.566633981094621e-06, + "loss": 0.4877, + "step": 2325 + }, + { + "epoch": 1.6130374479889042, + "grad_norm": 0.3525751147871692, + "learning_rate": 8.564937055771468e-06, + "loss": 0.5632, + "step": 2326 + }, + { + "epoch": 1.6137309292649098, + "grad_norm": 0.33655687426200803, + "learning_rate": 8.563239294852885e-06, + "loss": 0.5249, + "step": 2327 + }, + { + "epoch": 1.6144244105409153, + "grad_norm": 0.3341375490350698, + "learning_rate": 8.561540698736821e-06, + "loss": 0.5165, + "step": 2328 + }, + { + "epoch": 1.615117891816921, + "grad_norm": 0.3647406383552219, + "learning_rate": 8.559841267821409e-06, + "loss": 0.5521, + "step": 2329 + }, + { + "epoch": 1.6158113730929264, + "grad_norm": 0.33799324385201407, + "learning_rate": 8.558141002504987e-06, + "loss": 0.5019, + "step": 2330 + }, + { + "epoch": 1.616504854368932, + "grad_norm": 0.31136255655238165, + "learning_rate": 8.556439903186082e-06, + "loss": 0.4336, + "step": 2331 + }, + { + "epoch": 1.6171983356449375, + "grad_norm": 0.3331417838237942, + "learning_rate": 8.55473797026342e-06, + "loss": 0.4883, + "step": 2332 + }, + { + "epoch": 1.617891816920943, + "grad_norm": 0.3077342605630263, + "learning_rate": 8.553035204135925e-06, + "loss": 0.4584, + "step": 2333 + }, + { + "epoch": 1.6185852981969486, + "grad_norm": 0.31512207029311223, + "learning_rate": 8.551331605202708e-06, + "loss": 0.4595, + "step": 2334 + }, + { + "epoch": 1.6192787794729542, + "grad_norm": 0.3100505336704164, + "learning_rate": 8.549627173863085e-06, + "loss": 0.4833, + "step": 2335 + }, + { + "epoch": 1.6199722607489597, + "grad_norm": 0.3348803822961055, + "learning_rate": 8.547921910516556e-06, + "loss": 0.4647, + "step": 2336 + }, + { + "epoch": 1.6206657420249653, + "grad_norm": 0.35412026804913915, + "learning_rate": 8.546215815562831e-06, + "loss": 0.456, + "step": 2337 + }, + { + "epoch": 1.6213592233009708, + "grad_norm": 0.3373130549490593, + "learning_rate": 8.544508889401799e-06, + "loss": 0.4551, + "step": 2338 + }, + { + "epoch": 1.6220527045769764, + "grad_norm": 0.3416280756280188, + "learning_rate": 8.542801132433554e-06, + "loss": 0.5362, + "step": 2339 + }, + { + "epoch": 1.622746185852982, + "grad_norm": 0.3547589986333213, + "learning_rate": 8.541092545058383e-06, + "loss": 0.5314, + "step": 2340 + }, + { + "epoch": 1.6234396671289875, + "grad_norm": 0.34010660436726803, + "learning_rate": 8.539383127676764e-06, + "loss": 0.5294, + "step": 2341 + }, + { + "epoch": 1.624133148404993, + "grad_norm": 0.33219758876713085, + "learning_rate": 8.537672880689374e-06, + "loss": 0.4964, + "step": 2342 + }, + { + "epoch": 1.6248266296809986, + "grad_norm": 0.3381789141756323, + "learning_rate": 8.535961804497081e-06, + "loss": 0.4363, + "step": 2343 + }, + { + "epoch": 1.6255201109570043, + "grad_norm": 0.3421398607098175, + "learning_rate": 8.53424989950095e-06, + "loss": 0.5314, + "step": 2344 + }, + { + "epoch": 1.6262135922330097, + "grad_norm": 0.3926612390608668, + "learning_rate": 8.53253716610224e-06, + "loss": 0.5333, + "step": 2345 + }, + { + "epoch": 1.6269070735090154, + "grad_norm": 0.31329313173084145, + "learning_rate": 8.530823604702402e-06, + "loss": 0.4712, + "step": 2346 + }, + { + "epoch": 1.6276005547850207, + "grad_norm": 0.32639142826056655, + "learning_rate": 8.529109215703082e-06, + "loss": 0.4718, + "step": 2347 + }, + { + "epoch": 1.6282940360610265, + "grad_norm": 0.35821919319031414, + "learning_rate": 8.52739399950612e-06, + "loss": 0.4757, + "step": 2348 + }, + { + "epoch": 1.6289875173370318, + "grad_norm": 0.3585954356440912, + "learning_rate": 8.525677956513552e-06, + "loss": 0.4417, + "step": 2349 + }, + { + "epoch": 1.6296809986130376, + "grad_norm": 0.3688836681015824, + "learning_rate": 8.523961087127605e-06, + "loss": 0.532, + "step": 2350 + }, + { + "epoch": 1.630374479889043, + "grad_norm": 0.3670613238165888, + "learning_rate": 8.522243391750699e-06, + "loss": 0.4667, + "step": 2351 + }, + { + "epoch": 1.6310679611650487, + "grad_norm": 0.34101555271089323, + "learning_rate": 8.520524870785453e-06, + "loss": 0.5179, + "step": 2352 + }, + { + "epoch": 1.631761442441054, + "grad_norm": 0.35111151709950245, + "learning_rate": 8.518805524634675e-06, + "loss": 0.5117, + "step": 2353 + }, + { + "epoch": 1.6324549237170598, + "grad_norm": 0.3317940926638129, + "learning_rate": 8.517085353701366e-06, + "loss": 0.5048, + "step": 2354 + }, + { + "epoch": 1.6331484049930651, + "grad_norm": 0.37414614984125183, + "learning_rate": 8.515364358388722e-06, + "loss": 0.5789, + "step": 2355 + }, + { + "epoch": 1.633841886269071, + "grad_norm": 0.3376622621389358, + "learning_rate": 8.51364253910013e-06, + "loss": 0.4428, + "step": 2356 + }, + { + "epoch": 1.6345353675450762, + "grad_norm": 0.34426885902052357, + "learning_rate": 8.511919896239176e-06, + "loss": 0.4405, + "step": 2357 + }, + { + "epoch": 1.635228848821082, + "grad_norm": 0.3847237659073976, + "learning_rate": 8.510196430209632e-06, + "loss": 0.4971, + "step": 2358 + }, + { + "epoch": 1.6359223300970873, + "grad_norm": 0.37985385094614077, + "learning_rate": 8.508472141415468e-06, + "loss": 0.5116, + "step": 2359 + }, + { + "epoch": 1.636615811373093, + "grad_norm": 0.31211572860115416, + "learning_rate": 8.506747030260841e-06, + "loss": 0.4679, + "step": 2360 + }, + { + "epoch": 1.6373092926490984, + "grad_norm": 0.32726561785027947, + "learning_rate": 8.505021097150108e-06, + "loss": 0.557, + "step": 2361 + }, + { + "epoch": 1.6380027739251042, + "grad_norm": 0.35681214550346435, + "learning_rate": 8.503294342487815e-06, + "loss": 0.4792, + "step": 2362 + }, + { + "epoch": 1.6386962552011095, + "grad_norm": 0.3492848909148922, + "learning_rate": 8.501566766678701e-06, + "loss": 0.5307, + "step": 2363 + }, + { + "epoch": 1.6393897364771153, + "grad_norm": 0.31637614213127896, + "learning_rate": 8.499838370127696e-06, + "loss": 0.4835, + "step": 2364 + }, + { + "epoch": 1.6400832177531206, + "grad_norm": 0.33730014438008116, + "learning_rate": 8.498109153239924e-06, + "loss": 0.5135, + "step": 2365 + }, + { + "epoch": 1.6407766990291264, + "grad_norm": 0.40935874263822575, + "learning_rate": 8.4963791164207e-06, + "loss": 0.5421, + "step": 2366 + }, + { + "epoch": 1.6414701803051317, + "grad_norm": 0.3379064073476399, + "learning_rate": 8.494648260075533e-06, + "loss": 0.523, + "step": 2367 + }, + { + "epoch": 1.6421636615811375, + "grad_norm": 0.31702626560645264, + "learning_rate": 8.492916584610124e-06, + "loss": 0.419, + "step": 2368 + }, + { + "epoch": 1.6428571428571428, + "grad_norm": 0.3601047079642492, + "learning_rate": 8.491184090430365e-06, + "loss": 0.5253, + "step": 2369 + }, + { + "epoch": 1.6435506241331486, + "grad_norm": 0.3382148672494078, + "learning_rate": 8.489450777942339e-06, + "loss": 0.4794, + "step": 2370 + }, + { + "epoch": 1.6442441054091539, + "grad_norm": 0.38147039535952726, + "learning_rate": 8.487716647552321e-06, + "loss": 0.5426, + "step": 2371 + }, + { + "epoch": 1.6449375866851597, + "grad_norm": 0.39262805949278906, + "learning_rate": 8.485981699666783e-06, + "loss": 0.6066, + "step": 2372 + }, + { + "epoch": 1.645631067961165, + "grad_norm": 0.3311634228969915, + "learning_rate": 8.484245934692379e-06, + "loss": 0.4823, + "step": 2373 + }, + { + "epoch": 1.6463245492371708, + "grad_norm": 0.3094441739636701, + "learning_rate": 8.482509353035963e-06, + "loss": 0.4549, + "step": 2374 + }, + { + "epoch": 1.647018030513176, + "grad_norm": 0.3451938236517863, + "learning_rate": 8.480771955104576e-06, + "loss": 0.5351, + "step": 2375 + }, + { + "epoch": 1.6477115117891818, + "grad_norm": 0.3914481969080076, + "learning_rate": 8.479033741305451e-06, + "loss": 0.5234, + "step": 2376 + }, + { + "epoch": 1.6484049930651872, + "grad_norm": 0.35019362739476884, + "learning_rate": 8.477294712046015e-06, + "loss": 0.4876, + "step": 2377 + }, + { + "epoch": 1.649098474341193, + "grad_norm": 0.32659517998467763, + "learning_rate": 8.47555486773388e-06, + "loss": 0.4765, + "step": 2378 + }, + { + "epoch": 1.6497919556171983, + "grad_norm": 0.3544349859107952, + "learning_rate": 8.473814208776859e-06, + "loss": 0.5009, + "step": 2379 + }, + { + "epoch": 1.650485436893204, + "grad_norm": 0.33145226650270854, + "learning_rate": 8.472072735582942e-06, + "loss": 0.4908, + "step": 2380 + }, + { + "epoch": 1.6511789181692094, + "grad_norm": 0.36111427694300746, + "learning_rate": 8.470330448560322e-06, + "loss": 0.5376, + "step": 2381 + }, + { + "epoch": 1.6518723994452151, + "grad_norm": 0.34250434203443375, + "learning_rate": 8.46858734811738e-06, + "loss": 0.5121, + "step": 2382 + }, + { + "epoch": 1.6525658807212205, + "grad_norm": 0.3386582342923932, + "learning_rate": 8.466843434662684e-06, + "loss": 0.459, + "step": 2383 + }, + { + "epoch": 1.6532593619972262, + "grad_norm": 0.30438521537428603, + "learning_rate": 8.465098708604993e-06, + "loss": 0.4244, + "step": 2384 + }, + { + "epoch": 1.6539528432732316, + "grad_norm": 0.35913260197182756, + "learning_rate": 8.463353170353263e-06, + "loss": 0.4943, + "step": 2385 + }, + { + "epoch": 1.6546463245492373, + "grad_norm": 0.34759450587753227, + "learning_rate": 8.46160682031663e-06, + "loss": 0.5221, + "step": 2386 + }, + { + "epoch": 1.6553398058252426, + "grad_norm": 0.44793266660351866, + "learning_rate": 8.45985965890443e-06, + "loss": 0.5094, + "step": 2387 + }, + { + "epoch": 1.6560332871012484, + "grad_norm": 0.3362340108452135, + "learning_rate": 8.458111686526183e-06, + "loss": 0.4799, + "step": 2388 + }, + { + "epoch": 1.6567267683772537, + "grad_norm": 0.3623547035990375, + "learning_rate": 8.456362903591602e-06, + "loss": 0.5116, + "step": 2389 + }, + { + "epoch": 1.6574202496532595, + "grad_norm": 0.3347272319748318, + "learning_rate": 8.454613310510589e-06, + "loss": 0.5074, + "step": 2390 + }, + { + "epoch": 1.6581137309292648, + "grad_norm": 0.3419824071284227, + "learning_rate": 8.452862907693233e-06, + "loss": 0.5128, + "step": 2391 + }, + { + "epoch": 1.6588072122052706, + "grad_norm": 0.35485842251470634, + "learning_rate": 8.45111169554982e-06, + "loss": 0.5367, + "step": 2392 + }, + { + "epoch": 1.659500693481276, + "grad_norm": 0.37391988512056223, + "learning_rate": 8.44935967449082e-06, + "loss": 0.4835, + "step": 2393 + }, + { + "epoch": 1.6601941747572817, + "grad_norm": 0.3673986650933405, + "learning_rate": 8.447606844926895e-06, + "loss": 0.5052, + "step": 2394 + }, + { + "epoch": 1.660887656033287, + "grad_norm": 0.3881693520995384, + "learning_rate": 8.44585320726889e-06, + "loss": 0.4922, + "step": 2395 + }, + { + "epoch": 1.6615811373092928, + "grad_norm": 0.325655165536149, + "learning_rate": 8.444098761927855e-06, + "loss": 0.4583, + "step": 2396 + }, + { + "epoch": 1.6622746185852981, + "grad_norm": 0.3846994429297036, + "learning_rate": 8.44234350931501e-06, + "loss": 0.497, + "step": 2397 + }, + { + "epoch": 1.662968099861304, + "grad_norm": 0.4501128496829653, + "learning_rate": 8.440587449841778e-06, + "loss": 0.5311, + "step": 2398 + }, + { + "epoch": 1.6636615811373092, + "grad_norm": 0.3391140575840351, + "learning_rate": 8.438830583919764e-06, + "loss": 0.5657, + "step": 2399 + }, + { + "epoch": 1.664355062413315, + "grad_norm": 0.3276414395723224, + "learning_rate": 8.437072911960768e-06, + "loss": 0.5109, + "step": 2400 + }, + { + "epoch": 1.6650485436893203, + "grad_norm": 0.3186521534762923, + "learning_rate": 8.435314434376773e-06, + "loss": 0.4476, + "step": 2401 + }, + { + "epoch": 1.665742024965326, + "grad_norm": 0.3258387241321177, + "learning_rate": 8.433555151579955e-06, + "loss": 0.4531, + "step": 2402 + }, + { + "epoch": 1.6664355062413314, + "grad_norm": 0.3141937433091077, + "learning_rate": 8.431795063982676e-06, + "loss": 0.5081, + "step": 2403 + }, + { + "epoch": 1.6671289875173372, + "grad_norm": 0.3533433800951231, + "learning_rate": 8.430034171997487e-06, + "loss": 0.537, + "step": 2404 + }, + { + "epoch": 1.6678224687933425, + "grad_norm": 0.3785611905897186, + "learning_rate": 8.428272476037131e-06, + "loss": 0.4925, + "step": 2405 + }, + { + "epoch": 1.6685159500693483, + "grad_norm": 0.3451032533804043, + "learning_rate": 8.426509976514535e-06, + "loss": 0.5093, + "step": 2406 + }, + { + "epoch": 1.6692094313453536, + "grad_norm": 0.3983835234416269, + "learning_rate": 8.424746673842817e-06, + "loss": 0.5818, + "step": 2407 + }, + { + "epoch": 1.6699029126213594, + "grad_norm": 0.35946256960241124, + "learning_rate": 8.422982568435283e-06, + "loss": 0.5147, + "step": 2408 + }, + { + "epoch": 1.6705963938973647, + "grad_norm": 0.33512022199958486, + "learning_rate": 8.421217660705423e-06, + "loss": 0.5014, + "step": 2409 + }, + { + "epoch": 1.6712898751733705, + "grad_norm": 0.3279723714645418, + "learning_rate": 8.419451951066922e-06, + "loss": 0.4734, + "step": 2410 + }, + { + "epoch": 1.6719833564493758, + "grad_norm": 0.3113632797827704, + "learning_rate": 8.417685439933647e-06, + "loss": 0.4941, + "step": 2411 + }, + { + "epoch": 1.6726768377253816, + "grad_norm": 0.361208800565422, + "learning_rate": 8.415918127719659e-06, + "loss": 0.4964, + "step": 2412 + }, + { + "epoch": 1.6733703190013869, + "grad_norm": 0.35029005520106865, + "learning_rate": 8.4141500148392e-06, + "loss": 0.5474, + "step": 2413 + }, + { + "epoch": 1.6740638002773927, + "grad_norm": 0.38762583517759724, + "learning_rate": 8.412381101706706e-06, + "loss": 0.5499, + "step": 2414 + }, + { + "epoch": 1.674757281553398, + "grad_norm": 0.33443090207346654, + "learning_rate": 8.410611388736793e-06, + "loss": 0.4624, + "step": 2415 + }, + { + "epoch": 1.6754507628294038, + "grad_norm": 0.3218181688695065, + "learning_rate": 8.408840876344271e-06, + "loss": 0.4613, + "step": 2416 + }, + { + "epoch": 1.676144244105409, + "grad_norm": 0.33119099547396874, + "learning_rate": 8.407069564944136e-06, + "loss": 0.5194, + "step": 2417 + }, + { + "epoch": 1.6768377253814148, + "grad_norm": 0.33015613336472177, + "learning_rate": 8.405297454951571e-06, + "loss": 0.498, + "step": 2418 + }, + { + "epoch": 1.6775312066574202, + "grad_norm": 0.3452461370979553, + "learning_rate": 8.403524546781945e-06, + "loss": 0.4673, + "step": 2419 + }, + { + "epoch": 1.678224687933426, + "grad_norm": 0.3885633218107612, + "learning_rate": 8.401750840850814e-06, + "loss": 0.5108, + "step": 2420 + }, + { + "epoch": 1.6789181692094313, + "grad_norm": 0.38120231623458456, + "learning_rate": 8.399976337573922e-06, + "loss": 0.524, + "step": 2421 + }, + { + "epoch": 1.679611650485437, + "grad_norm": 0.35916656477788433, + "learning_rate": 8.398201037367202e-06, + "loss": 0.4893, + "step": 2422 + }, + { + "epoch": 1.6803051317614424, + "grad_norm": 0.32811593502155867, + "learning_rate": 8.39642494064677e-06, + "loss": 0.4315, + "step": 2423 + }, + { + "epoch": 1.6809986130374481, + "grad_norm": 0.3432195418816232, + "learning_rate": 8.394648047828929e-06, + "loss": 0.445, + "step": 2424 + }, + { + "epoch": 1.6816920943134535, + "grad_norm": 0.3450660030159922, + "learning_rate": 8.39287035933017e-06, + "loss": 0.4823, + "step": 2425 + }, + { + "epoch": 1.6823855755894592, + "grad_norm": 0.327998520064132, + "learning_rate": 8.391091875567172e-06, + "loss": 0.457, + "step": 2426 + }, + { + "epoch": 1.6830790568654646, + "grad_norm": 0.3214512239859119, + "learning_rate": 8.389312596956797e-06, + "loss": 0.4285, + "step": 2427 + }, + { + "epoch": 1.6837725381414703, + "grad_norm": 0.3746752871576561, + "learning_rate": 8.387532523916097e-06, + "loss": 0.5089, + "step": 2428 + }, + { + "epoch": 1.6844660194174756, + "grad_norm": 0.3344666780359997, + "learning_rate": 8.385751656862305e-06, + "loss": 0.4917, + "step": 2429 + }, + { + "epoch": 1.6851595006934814, + "grad_norm": 0.3090978619731297, + "learning_rate": 8.383969996212847e-06, + "loss": 0.5079, + "step": 2430 + }, + { + "epoch": 1.6858529819694867, + "grad_norm": 0.3500562402914821, + "learning_rate": 8.382187542385329e-06, + "loss": 0.5758, + "step": 2431 + }, + { + "epoch": 1.6865464632454925, + "grad_norm": 0.3024623056590533, + "learning_rate": 8.380404295797549e-06, + "loss": 0.457, + "step": 2432 + }, + { + "epoch": 1.6872399445214978, + "grad_norm": 0.3569864578946277, + "learning_rate": 8.37862025686748e-06, + "loss": 0.4905, + "step": 2433 + }, + { + "epoch": 1.6879334257975036, + "grad_norm": 0.37597910967736453, + "learning_rate": 8.376835426013293e-06, + "loss": 0.5645, + "step": 2434 + }, + { + "epoch": 1.688626907073509, + "grad_norm": 0.3215998184985862, + "learning_rate": 8.375049803653338e-06, + "loss": 0.4702, + "step": 2435 + }, + { + "epoch": 1.6893203883495147, + "grad_norm": 0.3304465412730977, + "learning_rate": 8.373263390206155e-06, + "loss": 0.5013, + "step": 2436 + }, + { + "epoch": 1.69001386962552, + "grad_norm": 0.3362356458388988, + "learning_rate": 8.37147618609046e-06, + "loss": 0.5372, + "step": 2437 + }, + { + "epoch": 1.6907073509015258, + "grad_norm": 0.3316397757105315, + "learning_rate": 8.369688191725167e-06, + "loss": 0.4841, + "step": 2438 + }, + { + "epoch": 1.6914008321775311, + "grad_norm": 0.3436839283442423, + "learning_rate": 8.367899407529366e-06, + "loss": 0.4775, + "step": 2439 + }, + { + "epoch": 1.692094313453537, + "grad_norm": 0.3530057847109228, + "learning_rate": 8.366109833922335e-06, + "loss": 0.5103, + "step": 2440 + }, + { + "epoch": 1.6927877947295422, + "grad_norm": 0.38237495469363436, + "learning_rate": 8.364319471323537e-06, + "loss": 0.5424, + "step": 2441 + }, + { + "epoch": 1.693481276005548, + "grad_norm": 0.3323218181024924, + "learning_rate": 8.362528320152621e-06, + "loss": 0.5073, + "step": 2442 + }, + { + "epoch": 1.6941747572815533, + "grad_norm": 0.3395087230743487, + "learning_rate": 8.36073638082942e-06, + "loss": 0.4976, + "step": 2443 + }, + { + "epoch": 1.694868238557559, + "grad_norm": 0.43803096709012057, + "learning_rate": 8.35894365377395e-06, + "loss": 0.5614, + "step": 2444 + }, + { + "epoch": 1.6955617198335644, + "grad_norm": 0.4092037821750451, + "learning_rate": 8.357150139406414e-06, + "loss": 0.5346, + "step": 2445 + }, + { + "epoch": 1.6962552011095702, + "grad_norm": 0.36379433307402875, + "learning_rate": 8.355355838147199e-06, + "loss": 0.482, + "step": 2446 + }, + { + "epoch": 1.6969486823855755, + "grad_norm": 0.33761524734315884, + "learning_rate": 8.353560750416876e-06, + "loss": 0.5029, + "step": 2447 + }, + { + "epoch": 1.6976421636615813, + "grad_norm": 0.3200023027999064, + "learning_rate": 8.351764876636202e-06, + "loss": 0.4316, + "step": 2448 + }, + { + "epoch": 1.6983356449375866, + "grad_norm": 0.35372822392811026, + "learning_rate": 8.349968217226114e-06, + "loss": 0.5526, + "step": 2449 + }, + { + "epoch": 1.6990291262135924, + "grad_norm": 0.33553182120293173, + "learning_rate": 8.348170772607737e-06, + "loss": 0.5162, + "step": 2450 + }, + { + "epoch": 1.6997226074895977, + "grad_norm": 0.3479254809150717, + "learning_rate": 8.346372543202382e-06, + "loss": 0.5343, + "step": 2451 + }, + { + "epoch": 1.7004160887656035, + "grad_norm": 0.5812679854894354, + "learning_rate": 8.344573529431536e-06, + "loss": 0.4798, + "step": 2452 + }, + { + "epoch": 1.7011095700416088, + "grad_norm": 0.3674849404115623, + "learning_rate": 8.342773731716878e-06, + "loss": 0.5301, + "step": 2453 + }, + { + "epoch": 1.7018030513176146, + "grad_norm": 0.37233526724345034, + "learning_rate": 8.340973150480266e-06, + "loss": 0.5202, + "step": 2454 + }, + { + "epoch": 1.7024965325936199, + "grad_norm": 0.35593068200025635, + "learning_rate": 8.339171786143747e-06, + "loss": 0.5084, + "step": 2455 + }, + { + "epoch": 1.7031900138696257, + "grad_norm": 0.3230195542324644, + "learning_rate": 8.337369639129541e-06, + "loss": 0.4771, + "step": 2456 + }, + { + "epoch": 1.703883495145631, + "grad_norm": 0.3599876212209188, + "learning_rate": 8.335566709860065e-06, + "loss": 0.5087, + "step": 2457 + }, + { + "epoch": 1.7045769764216367, + "grad_norm": 0.2976928682742035, + "learning_rate": 8.333762998757908e-06, + "loss": 0.4271, + "step": 2458 + }, + { + "epoch": 1.705270457697642, + "grad_norm": 0.45662886179373235, + "learning_rate": 8.331958506245849e-06, + "loss": 0.5037, + "step": 2459 + }, + { + "epoch": 1.7059639389736478, + "grad_norm": 0.6422844526708903, + "learning_rate": 8.330153232746846e-06, + "loss": 0.4767, + "step": 2460 + }, + { + "epoch": 1.7066574202496532, + "grad_norm": 0.38898480654620865, + "learning_rate": 8.328347178684045e-06, + "loss": 0.5264, + "step": 2461 + }, + { + "epoch": 1.707350901525659, + "grad_norm": 0.3540969996439805, + "learning_rate": 8.32654034448077e-06, + "loss": 0.5359, + "step": 2462 + }, + { + "epoch": 1.7080443828016643, + "grad_norm": 0.3505125568171822, + "learning_rate": 8.32473273056053e-06, + "loss": 0.4538, + "step": 2463 + }, + { + "epoch": 1.70873786407767, + "grad_norm": 0.3730192545013024, + "learning_rate": 8.322924337347016e-06, + "loss": 0.5113, + "step": 2464 + }, + { + "epoch": 1.7094313453536754, + "grad_norm": 0.3437784529450237, + "learning_rate": 8.321115165264102e-06, + "loss": 0.4481, + "step": 2465 + }, + { + "epoch": 1.7101248266296811, + "grad_norm": 0.3500718730594351, + "learning_rate": 8.31930521473585e-06, + "loss": 0.5189, + "step": 2466 + }, + { + "epoch": 1.7108183079056865, + "grad_norm": 1.3020713277067153, + "learning_rate": 8.31749448618649e-06, + "loss": 0.4947, + "step": 2467 + }, + { + "epoch": 1.7115117891816922, + "grad_norm": 0.38730326596472725, + "learning_rate": 8.315682980040454e-06, + "loss": 0.5072, + "step": 2468 + }, + { + "epoch": 1.7122052704576975, + "grad_norm": 0.3385802479454782, + "learning_rate": 8.313870696722338e-06, + "loss": 0.4652, + "step": 2469 + }, + { + "epoch": 1.7128987517337033, + "grad_norm": 0.31967293601897206, + "learning_rate": 8.31205763665693e-06, + "loss": 0.4788, + "step": 2470 + }, + { + "epoch": 1.7135922330097086, + "grad_norm": 0.3908487619420314, + "learning_rate": 8.3102438002692e-06, + "loss": 0.4675, + "step": 2471 + }, + { + "epoch": 1.7142857142857144, + "grad_norm": 0.40637026388607694, + "learning_rate": 8.308429187984298e-06, + "loss": 0.5062, + "step": 2472 + }, + { + "epoch": 1.7149791955617197, + "grad_norm": 0.3542652823210553, + "learning_rate": 8.306613800227555e-06, + "loss": 0.5213, + "step": 2473 + }, + { + "epoch": 1.7156726768377255, + "grad_norm": 0.32988776133636527, + "learning_rate": 8.304797637424484e-06, + "loss": 0.5028, + "step": 2474 + }, + { + "epoch": 1.7163661581137308, + "grad_norm": 0.36309291428325613, + "learning_rate": 8.30298070000078e-06, + "loss": 0.4874, + "step": 2475 + }, + { + "epoch": 1.7170596393897366, + "grad_norm": 0.353391374842643, + "learning_rate": 8.301162988382325e-06, + "loss": 0.5011, + "step": 2476 + }, + { + "epoch": 1.717753120665742, + "grad_norm": 0.3212144882534916, + "learning_rate": 8.29934450299517e-06, + "loss": 0.5247, + "step": 2477 + }, + { + "epoch": 1.7184466019417477, + "grad_norm": 0.3444249238312598, + "learning_rate": 8.29752524426556e-06, + "loss": 0.5182, + "step": 2478 + }, + { + "epoch": 1.719140083217753, + "grad_norm": 0.3675114739167973, + "learning_rate": 8.295705212619916e-06, + "loss": 0.5026, + "step": 2479 + }, + { + "epoch": 1.7198335644937588, + "grad_norm": 0.36757609168659333, + "learning_rate": 8.293884408484835e-06, + "loss": 0.5189, + "step": 2480 + }, + { + "epoch": 1.7205270457697641, + "grad_norm": 0.3473402414865221, + "learning_rate": 8.292062832287107e-06, + "loss": 0.5004, + "step": 2481 + }, + { + "epoch": 1.7212205270457699, + "grad_norm": 0.3248994512518578, + "learning_rate": 8.290240484453693e-06, + "loss": 0.4429, + "step": 2482 + }, + { + "epoch": 1.7219140083217752, + "grad_norm": 0.3754054166117554, + "learning_rate": 8.288417365411738e-06, + "loss": 0.478, + "step": 2483 + }, + { + "epoch": 1.722607489597781, + "grad_norm": 0.4546400086310538, + "learning_rate": 8.28659347558857e-06, + "loss": 0.4843, + "step": 2484 + }, + { + "epoch": 1.7233009708737863, + "grad_norm": 0.3333067144791095, + "learning_rate": 8.284768815411693e-06, + "loss": 0.4708, + "step": 2485 + }, + { + "epoch": 1.723994452149792, + "grad_norm": 0.3707418743071705, + "learning_rate": 8.282943385308794e-06, + "loss": 0.4993, + "step": 2486 + }, + { + "epoch": 1.7246879334257974, + "grad_norm": 0.3320962453755535, + "learning_rate": 8.281117185707741e-06, + "loss": 0.5005, + "step": 2487 + }, + { + "epoch": 1.7253814147018032, + "grad_norm": 0.34239302367868774, + "learning_rate": 8.279290217036583e-06, + "loss": 0.5262, + "step": 2488 + }, + { + "epoch": 1.7260748959778085, + "grad_norm": 0.34487267720362547, + "learning_rate": 8.27746247972355e-06, + "loss": 0.4887, + "step": 2489 + }, + { + "epoch": 1.7267683772538143, + "grad_norm": 0.3521400798206825, + "learning_rate": 8.275633974197048e-06, + "loss": 0.4905, + "step": 2490 + }, + { + "epoch": 1.7274618585298196, + "grad_norm": 0.3437904733859773, + "learning_rate": 8.273804700885664e-06, + "loss": 0.4866, + "step": 2491 + }, + { + "epoch": 1.7281553398058254, + "grad_norm": 0.326765053662226, + "learning_rate": 8.27197466021817e-06, + "loss": 0.4424, + "step": 2492 + }, + { + "epoch": 1.7288488210818307, + "grad_norm": 0.33938505275484304, + "learning_rate": 8.27014385262351e-06, + "loss": 0.5245, + "step": 2493 + }, + { + "epoch": 1.7295423023578365, + "grad_norm": 0.3709333437985796, + "learning_rate": 8.268312278530816e-06, + "loss": 0.4937, + "step": 2494 + }, + { + "epoch": 1.7302357836338418, + "grad_norm": 0.3648784958531084, + "learning_rate": 8.266479938369395e-06, + "loss": 0.4699, + "step": 2495 + }, + { + "epoch": 1.7309292649098476, + "grad_norm": 0.38184892930452446, + "learning_rate": 8.26464683256873e-06, + "loss": 0.5272, + "step": 2496 + }, + { + "epoch": 1.7316227461858529, + "grad_norm": 0.31143739047605695, + "learning_rate": 8.262812961558494e-06, + "loss": 0.4721, + "step": 2497 + }, + { + "epoch": 1.7323162274618586, + "grad_norm": 0.36252154762018163, + "learning_rate": 8.26097832576853e-06, + "loss": 0.5058, + "step": 2498 + }, + { + "epoch": 1.733009708737864, + "grad_norm": 0.34851031406026894, + "learning_rate": 8.259142925628862e-06, + "loss": 0.491, + "step": 2499 + }, + { + "epoch": 1.7337031900138697, + "grad_norm": 0.4156842337734494, + "learning_rate": 8.257306761569697e-06, + "loss": 0.4562, + "step": 2500 + }, + { + "epoch": 1.734396671289875, + "grad_norm": 0.34043316498210197, + "learning_rate": 8.25546983402142e-06, + "loss": 0.4957, + "step": 2501 + }, + { + "epoch": 1.7350901525658808, + "grad_norm": 0.3797393027097939, + "learning_rate": 8.253632143414586e-06, + "loss": 0.564, + "step": 2502 + }, + { + "epoch": 1.7357836338418862, + "grad_norm": 0.33700527540918146, + "learning_rate": 8.251793690179947e-06, + "loss": 0.5597, + "step": 2503 + }, + { + "epoch": 1.736477115117892, + "grad_norm": 0.3690079523704426, + "learning_rate": 8.249954474748414e-06, + "loss": 0.5116, + "step": 2504 + }, + { + "epoch": 1.7371705963938973, + "grad_norm": 0.314775969337088, + "learning_rate": 8.24811449755109e-06, + "loss": 0.4644, + "step": 2505 + }, + { + "epoch": 1.737864077669903, + "grad_norm": 0.34626876034402704, + "learning_rate": 8.246273759019252e-06, + "loss": 0.5306, + "step": 2506 + }, + { + "epoch": 1.7385575589459084, + "grad_norm": 0.34914103867214885, + "learning_rate": 8.244432259584356e-06, + "loss": 0.4415, + "step": 2507 + }, + { + "epoch": 1.7392510402219141, + "grad_norm": 0.3085317865880849, + "learning_rate": 8.242589999678037e-06, + "loss": 0.4894, + "step": 2508 + }, + { + "epoch": 1.7399445214979194, + "grad_norm": 0.32595498921763916, + "learning_rate": 8.240746979732103e-06, + "loss": 0.494, + "step": 2509 + }, + { + "epoch": 1.7406380027739252, + "grad_norm": 0.40704577999226504, + "learning_rate": 8.23890320017855e-06, + "loss": 0.5072, + "step": 2510 + }, + { + "epoch": 1.7413314840499305, + "grad_norm": 0.37428305256480154, + "learning_rate": 8.237058661449543e-06, + "loss": 0.5011, + "step": 2511 + }, + { + "epoch": 1.7420249653259363, + "grad_norm": 0.37528891685475274, + "learning_rate": 8.23521336397743e-06, + "loss": 0.5164, + "step": 2512 + }, + { + "epoch": 1.7427184466019416, + "grad_norm": 0.32956195069890537, + "learning_rate": 8.233367308194735e-06, + "loss": 0.4872, + "step": 2513 + }, + { + "epoch": 1.7434119278779474, + "grad_norm": 0.3180733197569015, + "learning_rate": 8.231520494534158e-06, + "loss": 0.4476, + "step": 2514 + }, + { + "epoch": 1.7441054091539527, + "grad_norm": 0.34602940917357783, + "learning_rate": 8.229672923428582e-06, + "loss": 0.4924, + "step": 2515 + }, + { + "epoch": 1.7447988904299585, + "grad_norm": 0.3370268041212885, + "learning_rate": 8.227824595311064e-06, + "loss": 0.5216, + "step": 2516 + }, + { + "epoch": 1.7454923717059638, + "grad_norm": 0.33663939923219494, + "learning_rate": 8.22597551061484e-06, + "loss": 0.4844, + "step": 2517 + }, + { + "epoch": 1.7461858529819696, + "grad_norm": 0.3199724563766412, + "learning_rate": 8.224125669773315e-06, + "loss": 0.4997, + "step": 2518 + }, + { + "epoch": 1.746879334257975, + "grad_norm": 0.3340021689703054, + "learning_rate": 8.222275073220087e-06, + "loss": 0.4821, + "step": 2519 + }, + { + "epoch": 1.7475728155339807, + "grad_norm": 0.3119035674803005, + "learning_rate": 8.220423721388918e-06, + "loss": 0.4826, + "step": 2520 + }, + { + "epoch": 1.748266296809986, + "grad_norm": 0.33039699110525106, + "learning_rate": 8.21857161471375e-06, + "loss": 0.4728, + "step": 2521 + }, + { + "epoch": 1.7489597780859918, + "grad_norm": 0.45355884487551595, + "learning_rate": 8.216718753628708e-06, + "loss": 0.5551, + "step": 2522 + }, + { + "epoch": 1.7496532593619971, + "grad_norm": 0.33963600367254804, + "learning_rate": 8.214865138568084e-06, + "loss": 0.5278, + "step": 2523 + }, + { + "epoch": 1.7503467406380029, + "grad_norm": 0.3505431172678928, + "learning_rate": 8.213010769966356e-06, + "loss": 0.5443, + "step": 2524 + }, + { + "epoch": 1.7510402219140082, + "grad_norm": 0.3206237977061909, + "learning_rate": 8.211155648258174e-06, + "loss": 0.4502, + "step": 2525 + }, + { + "epoch": 1.751733703190014, + "grad_norm": 0.30258900307978975, + "learning_rate": 8.209299773878366e-06, + "loss": 0.4519, + "step": 2526 + }, + { + "epoch": 1.7524271844660193, + "grad_norm": 0.3250013913618086, + "learning_rate": 8.20744314726193e-06, + "loss": 0.4612, + "step": 2527 + }, + { + "epoch": 1.753120665742025, + "grad_norm": 0.3573259112331063, + "learning_rate": 8.205585768844051e-06, + "loss": 0.5632, + "step": 2528 + }, + { + "epoch": 1.7538141470180304, + "grad_norm": 0.3875040213395464, + "learning_rate": 8.203727639060085e-06, + "loss": 0.5466, + "step": 2529 + }, + { + "epoch": 1.7545076282940362, + "grad_norm": 0.37836072095406464, + "learning_rate": 8.201868758345561e-06, + "loss": 0.551, + "step": 2530 + }, + { + "epoch": 1.7552011095700415, + "grad_norm": 0.3258967129982822, + "learning_rate": 8.200009127136192e-06, + "loss": 0.4895, + "step": 2531 + }, + { + "epoch": 1.7558945908460473, + "grad_norm": 0.34535841444900556, + "learning_rate": 8.198148745867855e-06, + "loss": 0.4943, + "step": 2532 + }, + { + "epoch": 1.7565880721220526, + "grad_norm": 0.29666217557456687, + "learning_rate": 8.196287614976617e-06, + "loss": 0.4573, + "step": 2533 + }, + { + "epoch": 1.7572815533980584, + "grad_norm": 0.3704184635178012, + "learning_rate": 8.19442573489871e-06, + "loss": 0.4879, + "step": 2534 + }, + { + "epoch": 1.7579750346740637, + "grad_norm": 0.36091698139889916, + "learning_rate": 8.192563106070547e-06, + "loss": 0.5343, + "step": 2535 + }, + { + "epoch": 1.7586685159500695, + "grad_norm": 0.3630942363096478, + "learning_rate": 8.190699728928712e-06, + "loss": 0.4714, + "step": 2536 + }, + { + "epoch": 1.7593619972260748, + "grad_norm": 0.3132017824288624, + "learning_rate": 8.188835603909967e-06, + "loss": 0.4992, + "step": 2537 + }, + { + "epoch": 1.7600554785020806, + "grad_norm": 0.3027009572027872, + "learning_rate": 8.186970731451255e-06, + "loss": 0.5029, + "step": 2538 + }, + { + "epoch": 1.7607489597780859, + "grad_norm": 0.313851498975355, + "learning_rate": 8.185105111989683e-06, + "loss": 0.4923, + "step": 2539 + }, + { + "epoch": 1.7614424410540916, + "grad_norm": 0.36778585480576975, + "learning_rate": 8.18323874596254e-06, + "loss": 0.5133, + "step": 2540 + }, + { + "epoch": 1.762135922330097, + "grad_norm": 0.3449764370113829, + "learning_rate": 8.181371633807289e-06, + "loss": 0.4989, + "step": 2541 + }, + { + "epoch": 1.7628294036061027, + "grad_norm": 0.33842515688554986, + "learning_rate": 8.179503775961569e-06, + "loss": 0.4814, + "step": 2542 + }, + { + "epoch": 1.763522884882108, + "grad_norm": 0.3313212544424528, + "learning_rate": 8.17763517286319e-06, + "loss": 0.4879, + "step": 2543 + }, + { + "epoch": 1.7642163661581138, + "grad_norm": 0.32778248906505236, + "learning_rate": 8.175765824950139e-06, + "loss": 0.4401, + "step": 2544 + }, + { + "epoch": 1.7649098474341192, + "grad_norm": 0.3599077775354727, + "learning_rate": 8.17389573266058e-06, + "loss": 0.4718, + "step": 2545 + }, + { + "epoch": 1.765603328710125, + "grad_norm": 0.3396295924894958, + "learning_rate": 8.172024896432847e-06, + "loss": 0.5057, + "step": 2546 + }, + { + "epoch": 1.7662968099861303, + "grad_norm": 0.3471878954839222, + "learning_rate": 8.17015331670545e-06, + "loss": 0.4972, + "step": 2547 + }, + { + "epoch": 1.766990291262136, + "grad_norm": 0.32736351081474324, + "learning_rate": 8.168280993917078e-06, + "loss": 0.4346, + "step": 2548 + }, + { + "epoch": 1.7676837725381414, + "grad_norm": 0.38675782685334803, + "learning_rate": 8.166407928506583e-06, + "loss": 0.5634, + "step": 2549 + }, + { + "epoch": 1.7683772538141471, + "grad_norm": 0.3683899472275578, + "learning_rate": 8.164534120913004e-06, + "loss": 0.5239, + "step": 2550 + }, + { + "epoch": 1.7690707350901524, + "grad_norm": 0.31560258879219105, + "learning_rate": 8.162659571575546e-06, + "loss": 0.416, + "step": 2551 + }, + { + "epoch": 1.7697642163661582, + "grad_norm": 0.3788950818694178, + "learning_rate": 8.160784280933589e-06, + "loss": 0.5286, + "step": 2552 + }, + { + "epoch": 1.7704576976421635, + "grad_norm": 0.3549915525328208, + "learning_rate": 8.158908249426687e-06, + "loss": 0.4567, + "step": 2553 + }, + { + "epoch": 1.7711511789181693, + "grad_norm": 0.3452211009361919, + "learning_rate": 8.15703147749457e-06, + "loss": 0.4821, + "step": 2554 + }, + { + "epoch": 1.7718446601941746, + "grad_norm": 0.333760562080443, + "learning_rate": 8.155153965577139e-06, + "loss": 0.4665, + "step": 2555 + }, + { + "epoch": 1.7725381414701804, + "grad_norm": 0.4003116848324873, + "learning_rate": 8.15327571411447e-06, + "loss": 0.4835, + "step": 2556 + }, + { + "epoch": 1.7732316227461857, + "grad_norm": 0.34523761601236286, + "learning_rate": 8.15139672354681e-06, + "loss": 0.494, + "step": 2557 + }, + { + "epoch": 1.7739251040221915, + "grad_norm": 0.3092718222272152, + "learning_rate": 8.149516994314581e-06, + "loss": 0.4647, + "step": 2558 + }, + { + "epoch": 1.7746185852981968, + "grad_norm": 0.33388856773233827, + "learning_rate": 8.14763652685838e-06, + "loss": 0.4958, + "step": 2559 + }, + { + "epoch": 1.7753120665742026, + "grad_norm": 0.3198893523900066, + "learning_rate": 8.145755321618972e-06, + "loss": 0.448, + "step": 2560 + }, + { + "epoch": 1.776005547850208, + "grad_norm": 0.31866305537905737, + "learning_rate": 8.1438733790373e-06, + "loss": 0.4403, + "step": 2561 + }, + { + "epoch": 1.7766990291262137, + "grad_norm": 0.3485110953024072, + "learning_rate": 8.141990699554476e-06, + "loss": 0.4709, + "step": 2562 + }, + { + "epoch": 1.777392510402219, + "grad_norm": 0.3164179012114437, + "learning_rate": 8.140107283611787e-06, + "loss": 0.4648, + "step": 2563 + }, + { + "epoch": 1.7780859916782248, + "grad_norm": 0.36956234103451036, + "learning_rate": 8.138223131650693e-06, + "loss": 0.5307, + "step": 2564 + }, + { + "epoch": 1.7787794729542301, + "grad_norm": 0.3437914379915076, + "learning_rate": 8.136338244112826e-06, + "loss": 0.4599, + "step": 2565 + }, + { + "epoch": 1.7794729542302359, + "grad_norm": 0.3439786188514912, + "learning_rate": 8.134452621439988e-06, + "loss": 0.4308, + "step": 2566 + }, + { + "epoch": 1.7801664355062412, + "grad_norm": 0.3311734097123211, + "learning_rate": 8.132566264074157e-06, + "loss": 0.5035, + "step": 2567 + }, + { + "epoch": 1.780859916782247, + "grad_norm": 0.37148619562143187, + "learning_rate": 8.13067917245748e-06, + "loss": 0.4722, + "step": 2568 + }, + { + "epoch": 1.7815533980582523, + "grad_norm": 0.6697161458880733, + "learning_rate": 8.12879134703228e-06, + "loss": 0.4492, + "step": 2569 + }, + { + "epoch": 1.782246879334258, + "grad_norm": 0.3512123657436848, + "learning_rate": 8.126902788241045e-06, + "loss": 0.4539, + "step": 2570 + }, + { + "epoch": 1.7829403606102634, + "grad_norm": 0.3621406791220599, + "learning_rate": 8.125013496526444e-06, + "loss": 0.4563, + "step": 2571 + }, + { + "epoch": 1.7836338418862692, + "grad_norm": 0.331242124044267, + "learning_rate": 8.123123472331314e-06, + "loss": 0.4574, + "step": 2572 + }, + { + "epoch": 1.7843273231622745, + "grad_norm": 0.31959757371424324, + "learning_rate": 8.121232716098659e-06, + "loss": 0.4923, + "step": 2573 + }, + { + "epoch": 1.7850208044382803, + "grad_norm": 0.4494412160848984, + "learning_rate": 8.11934122827166e-06, + "loss": 0.5015, + "step": 2574 + }, + { + "epoch": 1.7857142857142856, + "grad_norm": 0.34521632420014386, + "learning_rate": 8.117449009293668e-06, + "loss": 0.493, + "step": 2575 + }, + { + "epoch": 1.7864077669902914, + "grad_norm": 0.32728678861733757, + "learning_rate": 8.115556059608208e-06, + "loss": 0.4858, + "step": 2576 + }, + { + "epoch": 1.7871012482662967, + "grad_norm": 0.3582845788413125, + "learning_rate": 8.113662379658969e-06, + "loss": 0.4944, + "step": 2577 + }, + { + "epoch": 1.7877947295423025, + "grad_norm": 0.3717939194931279, + "learning_rate": 8.11176796988982e-06, + "loss": 0.5563, + "step": 2578 + }, + { + "epoch": 1.7884882108183078, + "grad_norm": 0.37431454154338295, + "learning_rate": 8.109872830744795e-06, + "loss": 0.5072, + "step": 2579 + }, + { + "epoch": 1.7891816920943135, + "grad_norm": 0.3510047894435186, + "learning_rate": 8.107976962668102e-06, + "loss": 0.497, + "step": 2580 + }, + { + "epoch": 1.7898751733703189, + "grad_norm": 0.3535079515486122, + "learning_rate": 8.106080366104117e-06, + "loss": 0.4557, + "step": 2581 + }, + { + "epoch": 1.7905686546463246, + "grad_norm": 0.3229653503595275, + "learning_rate": 8.104183041497389e-06, + "loss": 0.4905, + "step": 2582 + }, + { + "epoch": 1.79126213592233, + "grad_norm": 0.44770065352819477, + "learning_rate": 8.102284989292639e-06, + "loss": 0.6119, + "step": 2583 + }, + { + "epoch": 1.7919556171983357, + "grad_norm": 0.3357961525739306, + "learning_rate": 8.100386209934754e-06, + "loss": 0.4958, + "step": 2584 + }, + { + "epoch": 1.792649098474341, + "grad_norm": 0.308513659858812, + "learning_rate": 8.098486703868796e-06, + "loss": 0.4223, + "step": 2585 + }, + { + "epoch": 1.7933425797503468, + "grad_norm": 0.3214398273471279, + "learning_rate": 8.096586471539994e-06, + "loss": 0.4968, + "step": 2586 + }, + { + "epoch": 1.7940360610263522, + "grad_norm": 0.3420112961470549, + "learning_rate": 8.094685513393752e-06, + "loss": 0.473, + "step": 2587 + }, + { + "epoch": 1.794729542302358, + "grad_norm": 0.3666497150224497, + "learning_rate": 8.092783829875636e-06, + "loss": 0.5762, + "step": 2588 + }, + { + "epoch": 1.7954230235783633, + "grad_norm": 0.34547184429680877, + "learning_rate": 8.09088142143139e-06, + "loss": 0.5207, + "step": 2589 + }, + { + "epoch": 1.796116504854369, + "grad_norm": 0.3834786211695057, + "learning_rate": 8.088978288506923e-06, + "loss": 0.5767, + "step": 2590 + }, + { + "epoch": 1.7968099861303743, + "grad_norm": 0.33081319617384436, + "learning_rate": 8.08707443154832e-06, + "loss": 0.4609, + "step": 2591 + }, + { + "epoch": 1.7975034674063801, + "grad_norm": 0.33005505320516115, + "learning_rate": 8.085169851001825e-06, + "loss": 0.4168, + "step": 2592 + }, + { + "epoch": 1.7981969486823854, + "grad_norm": 0.35581289193530136, + "learning_rate": 8.083264547313863e-06, + "loss": 0.513, + "step": 2593 + }, + { + "epoch": 1.7988904299583912, + "grad_norm": 0.3492346033091902, + "learning_rate": 8.081358520931018e-06, + "loss": 0.5047, + "step": 2594 + }, + { + "epoch": 1.7995839112343965, + "grad_norm": 0.3772907702393545, + "learning_rate": 8.079451772300052e-06, + "loss": 0.472, + "step": 2595 + }, + { + "epoch": 1.8002773925104023, + "grad_norm": 0.360546535425315, + "learning_rate": 8.077544301867896e-06, + "loss": 0.4899, + "step": 2596 + }, + { + "epoch": 1.8009708737864076, + "grad_norm": 0.36631850143040506, + "learning_rate": 8.075636110081643e-06, + "loss": 0.5397, + "step": 2597 + }, + { + "epoch": 1.8016643550624134, + "grad_norm": 0.3629427258375066, + "learning_rate": 8.073727197388561e-06, + "loss": 0.498, + "step": 2598 + }, + { + "epoch": 1.8023578363384187, + "grad_norm": 0.3447326751967329, + "learning_rate": 8.071817564236084e-06, + "loss": 0.458, + "step": 2599 + }, + { + "epoch": 1.8030513176144245, + "grad_norm": 0.39934035769365833, + "learning_rate": 8.069907211071816e-06, + "loss": 0.509, + "step": 2600 + }, + { + "epoch": 1.8037447988904298, + "grad_norm": 0.3628951280754408, + "learning_rate": 8.067996138343535e-06, + "loss": 0.4864, + "step": 2601 + }, + { + "epoch": 1.8044382801664356, + "grad_norm": 0.3764346549515586, + "learning_rate": 8.066084346499176e-06, + "loss": 0.5411, + "step": 2602 + }, + { + "epoch": 1.805131761442441, + "grad_norm": 0.36142407024268214, + "learning_rate": 8.064171835986852e-06, + "loss": 0.5122, + "step": 2603 + }, + { + "epoch": 1.8058252427184467, + "grad_norm": 0.3329712865601718, + "learning_rate": 8.062258607254841e-06, + "loss": 0.4673, + "step": 2604 + }, + { + "epoch": 1.806518723994452, + "grad_norm": 0.33761714138991, + "learning_rate": 8.060344660751591e-06, + "loss": 0.4684, + "step": 2605 + }, + { + "epoch": 1.8072122052704578, + "grad_norm": 0.34647098619660877, + "learning_rate": 8.058429996925717e-06, + "loss": 0.5236, + "step": 2606 + }, + { + "epoch": 1.807905686546463, + "grad_norm": 0.3558401909155935, + "learning_rate": 8.056514616226001e-06, + "loss": 0.4761, + "step": 2607 + }, + { + "epoch": 1.8085991678224689, + "grad_norm": 0.34910942910293996, + "learning_rate": 8.054598519101396e-06, + "loss": 0.5036, + "step": 2608 + }, + { + "epoch": 1.8092926490984742, + "grad_norm": 0.3481902999440837, + "learning_rate": 8.05268170600102e-06, + "loss": 0.5216, + "step": 2609 + }, + { + "epoch": 1.80998613037448, + "grad_norm": 0.3400124519897086, + "learning_rate": 8.05076417737416e-06, + "loss": 0.491, + "step": 2610 + }, + { + "epoch": 1.8106796116504853, + "grad_norm": 0.3379953053496073, + "learning_rate": 8.048845933670274e-06, + "loss": 0.4978, + "step": 2611 + }, + { + "epoch": 1.811373092926491, + "grad_norm": 0.365658545970509, + "learning_rate": 8.046926975338978e-06, + "loss": 0.496, + "step": 2612 + }, + { + "epoch": 1.8120665742024964, + "grad_norm": 0.39904400921742555, + "learning_rate": 8.045007302830068e-06, + "loss": 0.4882, + "step": 2613 + }, + { + "epoch": 1.8127600554785022, + "grad_norm": 0.36452724045373436, + "learning_rate": 8.0430869165935e-06, + "loss": 0.521, + "step": 2614 + }, + { + "epoch": 1.8134535367545075, + "grad_norm": 0.36517647714668033, + "learning_rate": 8.041165817079397e-06, + "loss": 0.4932, + "step": 2615 + }, + { + "epoch": 1.8141470180305133, + "grad_norm": 0.33548186116342915, + "learning_rate": 8.039244004738051e-06, + "loss": 0.5209, + "step": 2616 + }, + { + "epoch": 1.8148404993065186, + "grad_norm": 0.3259302086648711, + "learning_rate": 8.037321480019921e-06, + "loss": 0.4927, + "step": 2617 + }, + { + "epoch": 1.8155339805825244, + "grad_norm": 0.2825621847555902, + "learning_rate": 8.035398243375636e-06, + "loss": 0.4603, + "step": 2618 + }, + { + "epoch": 1.8162274618585297, + "grad_norm": 0.37932516511922554, + "learning_rate": 8.033474295255986e-06, + "loss": 0.5119, + "step": 2619 + }, + { + "epoch": 1.8169209431345354, + "grad_norm": 0.33649712682234856, + "learning_rate": 8.031549636111928e-06, + "loss": 0.5313, + "step": 2620 + }, + { + "epoch": 1.8176144244105408, + "grad_norm": 0.36794769079990797, + "learning_rate": 8.029624266394592e-06, + "loss": 0.522, + "step": 2621 + }, + { + "epoch": 1.8183079056865465, + "grad_norm": 0.3534751022570253, + "learning_rate": 8.02769818655527e-06, + "loss": 0.4682, + "step": 2622 + }, + { + "epoch": 1.8190013869625519, + "grad_norm": 0.3583193431807195, + "learning_rate": 8.025771397045421e-06, + "loss": 0.5225, + "step": 2623 + }, + { + "epoch": 1.8196948682385576, + "grad_norm": 1.1135076037341172, + "learning_rate": 8.02384389831667e-06, + "loss": 0.5502, + "step": 2624 + }, + { + "epoch": 1.820388349514563, + "grad_norm": 0.324063155336584, + "learning_rate": 8.021915690820808e-06, + "loss": 0.4955, + "step": 2625 + }, + { + "epoch": 1.8210818307905687, + "grad_norm": 0.3496820628240374, + "learning_rate": 8.019986775009795e-06, + "loss": 0.4811, + "step": 2626 + }, + { + "epoch": 1.821775312066574, + "grad_norm": 0.3447998554678887, + "learning_rate": 8.018057151335752e-06, + "loss": 0.5276, + "step": 2627 + }, + { + "epoch": 1.8224687933425798, + "grad_norm": 0.3412124628965962, + "learning_rate": 8.016126820250972e-06, + "loss": 0.5099, + "step": 2628 + }, + { + "epoch": 1.8231622746185852, + "grad_norm": 0.35021409461541153, + "learning_rate": 8.01419578220791e-06, + "loss": 0.4749, + "step": 2629 + }, + { + "epoch": 1.823855755894591, + "grad_norm": 0.3339581098924237, + "learning_rate": 8.012264037659182e-06, + "loss": 0.4893, + "step": 2630 + }, + { + "epoch": 1.8245492371705962, + "grad_norm": 0.3514154973100117, + "learning_rate": 8.010331587057585e-06, + "loss": 0.4829, + "step": 2631 + }, + { + "epoch": 1.825242718446602, + "grad_norm": 0.36606702754495957, + "learning_rate": 8.008398430856064e-06, + "loss": 0.5437, + "step": 2632 + }, + { + "epoch": 1.8259361997226073, + "grad_norm": 0.3572356362069411, + "learning_rate": 8.006464569507737e-06, + "loss": 0.5267, + "step": 2633 + }, + { + "epoch": 1.8266296809986131, + "grad_norm": 0.36667852303146975, + "learning_rate": 8.004530003465891e-06, + "loss": 0.5074, + "step": 2634 + }, + { + "epoch": 1.8273231622746184, + "grad_norm": 0.3157941877578861, + "learning_rate": 8.002594733183971e-06, + "loss": 0.459, + "step": 2635 + }, + { + "epoch": 1.8280166435506242, + "grad_norm": 0.4112073131139841, + "learning_rate": 8.00065875911559e-06, + "loss": 0.4834, + "step": 2636 + }, + { + "epoch": 1.8287101248266295, + "grad_norm": 0.33776473552815095, + "learning_rate": 7.99872208171453e-06, + "loss": 0.4747, + "step": 2637 + }, + { + "epoch": 1.8294036061026353, + "grad_norm": 0.33433379311687755, + "learning_rate": 7.99678470143473e-06, + "loss": 0.4464, + "step": 2638 + }, + { + "epoch": 1.8300970873786406, + "grad_norm": 0.3425070302442074, + "learning_rate": 7.994846618730301e-06, + "loss": 0.5321, + "step": 2639 + }, + { + "epoch": 1.8307905686546464, + "grad_norm": 0.35198261413571635, + "learning_rate": 7.992907834055513e-06, + "loss": 0.5237, + "step": 2640 + }, + { + "epoch": 1.8314840499306517, + "grad_norm": 0.3276940133077291, + "learning_rate": 7.990968347864804e-06, + "loss": 0.4824, + "step": 2641 + }, + { + "epoch": 1.8321775312066575, + "grad_norm": 0.36387677266071716, + "learning_rate": 7.989028160612779e-06, + "loss": 0.4771, + "step": 2642 + }, + { + "epoch": 1.8328710124826628, + "grad_norm": 0.3856124357512831, + "learning_rate": 7.987087272754199e-06, + "loss": 0.5188, + "step": 2643 + }, + { + "epoch": 1.8335644937586686, + "grad_norm": 0.32502975843033866, + "learning_rate": 7.985145684743993e-06, + "loss": 0.4257, + "step": 2644 + }, + { + "epoch": 1.834257975034674, + "grad_norm": 0.3684901686346137, + "learning_rate": 7.983203397037261e-06, + "loss": 0.4641, + "step": 2645 + }, + { + "epoch": 1.8349514563106797, + "grad_norm": 0.3530964106730837, + "learning_rate": 7.981260410089258e-06, + "loss": 0.569, + "step": 2646 + }, + { + "epoch": 1.835644937586685, + "grad_norm": 0.45330775632682163, + "learning_rate": 7.979316724355406e-06, + "loss": 0.5673, + "step": 2647 + }, + { + "epoch": 1.8363384188626908, + "grad_norm": 0.34899484897550276, + "learning_rate": 7.97737234029129e-06, + "loss": 0.5419, + "step": 2648 + }, + { + "epoch": 1.837031900138696, + "grad_norm": 0.36946601962265596, + "learning_rate": 7.97542725835266e-06, + "loss": 0.4677, + "step": 2649 + }, + { + "epoch": 1.8377253814147019, + "grad_norm": 0.32280200454263647, + "learning_rate": 7.973481478995433e-06, + "loss": 0.4647, + "step": 2650 + }, + { + "epoch": 1.8384188626907072, + "grad_norm": 0.3540458704749818, + "learning_rate": 7.97153500267568e-06, + "loss": 0.4987, + "step": 2651 + }, + { + "epoch": 1.839112343966713, + "grad_norm": 0.34560866179222777, + "learning_rate": 7.969587829849644e-06, + "loss": 0.4661, + "step": 2652 + }, + { + "epoch": 1.8398058252427183, + "grad_norm": 0.41567913358775405, + "learning_rate": 7.967639960973727e-06, + "loss": 0.5251, + "step": 2653 + }, + { + "epoch": 1.840499306518724, + "grad_norm": 0.3361099075546891, + "learning_rate": 7.965691396504496e-06, + "loss": 0.5489, + "step": 2654 + }, + { + "epoch": 1.8411927877947294, + "grad_norm": 0.3196820195209324, + "learning_rate": 7.96374213689868e-06, + "loss": 0.4696, + "step": 2655 + }, + { + "epoch": 1.8418862690707352, + "grad_norm": 0.33633959801435626, + "learning_rate": 7.96179218261317e-06, + "loss": 0.4781, + "step": 2656 + }, + { + "epoch": 1.8425797503467405, + "grad_norm": 0.3434890969631632, + "learning_rate": 7.959841534105026e-06, + "loss": 0.4616, + "step": 2657 + }, + { + "epoch": 1.8432732316227463, + "grad_norm": 0.340579571076245, + "learning_rate": 7.95789019183146e-06, + "loss": 0.4676, + "step": 2658 + }, + { + "epoch": 1.8439667128987516, + "grad_norm": 0.3941390329546165, + "learning_rate": 7.955938156249856e-06, + "loss": 0.5465, + "step": 2659 + }, + { + "epoch": 1.8446601941747574, + "grad_norm": 0.34716317499059784, + "learning_rate": 7.953985427817757e-06, + "loss": 0.4605, + "step": 2660 + }, + { + "epoch": 1.8453536754507627, + "grad_norm": 0.3739178657087966, + "learning_rate": 7.952032006992865e-06, + "loss": 0.5196, + "step": 2661 + }, + { + "epoch": 1.8460471567267684, + "grad_norm": 0.3835030440585209, + "learning_rate": 7.950077894233051e-06, + "loss": 0.4727, + "step": 2662 + }, + { + "epoch": 1.8467406380027738, + "grad_norm": 0.34474733547803854, + "learning_rate": 7.948123089996345e-06, + "loss": 0.444, + "step": 2663 + }, + { + "epoch": 1.8474341192787795, + "grad_norm": 0.38383795280040944, + "learning_rate": 7.946167594740938e-06, + "loss": 0.5605, + "step": 2664 + }, + { + "epoch": 1.8481276005547849, + "grad_norm": 0.3827559486940056, + "learning_rate": 7.944211408925184e-06, + "loss": 0.5903, + "step": 2665 + }, + { + "epoch": 1.8488210818307906, + "grad_norm": 0.3487061847158636, + "learning_rate": 7.942254533007597e-06, + "loss": 0.5603, + "step": 2666 + }, + { + "epoch": 1.849514563106796, + "grad_norm": 0.3170466391970917, + "learning_rate": 7.94029696744686e-06, + "loss": 0.4559, + "step": 2667 + }, + { + "epoch": 1.8502080443828017, + "grad_norm": 0.37841259797232923, + "learning_rate": 7.938338712701805e-06, + "loss": 0.513, + "step": 2668 + }, + { + "epoch": 1.850901525658807, + "grad_norm": 0.3661516732247181, + "learning_rate": 7.936379769231436e-06, + "loss": 0.5056, + "step": 2669 + }, + { + "epoch": 1.8515950069348128, + "grad_norm": 0.3288161189708249, + "learning_rate": 7.934420137494917e-06, + "loss": 0.4683, + "step": 2670 + }, + { + "epoch": 1.8522884882108182, + "grad_norm": 0.42463178798517265, + "learning_rate": 7.93245981795157e-06, + "loss": 0.4649, + "step": 2671 + }, + { + "epoch": 1.852981969486824, + "grad_norm": 0.32121837261737696, + "learning_rate": 7.930498811060879e-06, + "loss": 0.4566, + "step": 2672 + }, + { + "epoch": 1.8536754507628292, + "grad_norm": 0.33612977400716404, + "learning_rate": 7.92853711728249e-06, + "loss": 0.4899, + "step": 2673 + }, + { + "epoch": 1.854368932038835, + "grad_norm": 0.34811870221940905, + "learning_rate": 7.92657473707621e-06, + "loss": 0.4946, + "step": 2674 + }, + { + "epoch": 1.8550624133148403, + "grad_norm": 0.45890294460618614, + "learning_rate": 7.924611670902008e-06, + "loss": 0.4614, + "step": 2675 + }, + { + "epoch": 1.8557558945908461, + "grad_norm": 0.3438035043765943, + "learning_rate": 7.92264791922001e-06, + "loss": 0.4855, + "step": 2676 + }, + { + "epoch": 1.8564493758668514, + "grad_norm": 0.3354919563928631, + "learning_rate": 7.92068348249051e-06, + "loss": 0.4654, + "step": 2677 + }, + { + "epoch": 1.8571428571428572, + "grad_norm": 0.33641671481189644, + "learning_rate": 7.918718361173951e-06, + "loss": 0.4763, + "step": 2678 + }, + { + "epoch": 1.8578363384188625, + "grad_norm": 0.32552095581966767, + "learning_rate": 7.916752555730946e-06, + "loss": 0.4618, + "step": 2679 + }, + { + "epoch": 1.8585298196948683, + "grad_norm": 0.43523484512876065, + "learning_rate": 7.914786066622268e-06, + "loss": 0.4844, + "step": 2680 + }, + { + "epoch": 1.8592233009708736, + "grad_norm": 0.30467028020508086, + "learning_rate": 7.912818894308845e-06, + "loss": 0.4429, + "step": 2681 + }, + { + "epoch": 1.8599167822468794, + "grad_norm": 0.3227150624697633, + "learning_rate": 7.910851039251768e-06, + "loss": 0.4908, + "step": 2682 + }, + { + "epoch": 1.8606102635228847, + "grad_norm": 0.33384052436968936, + "learning_rate": 7.908882501912289e-06, + "loss": 0.4476, + "step": 2683 + }, + { + "epoch": 1.8613037447988905, + "grad_norm": 0.3714640930412473, + "learning_rate": 7.90691328275182e-06, + "loss": 0.4669, + "step": 2684 + }, + { + "epoch": 1.8619972260748958, + "grad_norm": 0.3087761374772754, + "learning_rate": 7.90494338223193e-06, + "loss": 0.4333, + "step": 2685 + }, + { + "epoch": 1.8626907073509016, + "grad_norm": 0.36038541634353743, + "learning_rate": 7.90297280081435e-06, + "loss": 0.4655, + "step": 2686 + }, + { + "epoch": 1.863384188626907, + "grad_norm": 0.3465616915030342, + "learning_rate": 7.901001538960968e-06, + "loss": 0.4793, + "step": 2687 + }, + { + "epoch": 1.8640776699029127, + "grad_norm": 0.33590736181306824, + "learning_rate": 7.899029597133836e-06, + "loss": 0.4266, + "step": 2688 + }, + { + "epoch": 1.864771151178918, + "grad_norm": 0.31658115418413885, + "learning_rate": 7.897056975795163e-06, + "loss": 0.4345, + "step": 2689 + }, + { + "epoch": 1.8654646324549238, + "grad_norm": 0.34315383712519093, + "learning_rate": 7.895083675407316e-06, + "loss": 0.4863, + "step": 2690 + }, + { + "epoch": 1.866158113730929, + "grad_norm": 0.34222369247746415, + "learning_rate": 7.893109696432824e-06, + "loss": 0.539, + "step": 2691 + }, + { + "epoch": 1.8668515950069349, + "grad_norm": 0.33203178507466397, + "learning_rate": 7.89113503933437e-06, + "loss": 0.4939, + "step": 2692 + }, + { + "epoch": 1.8675450762829402, + "grad_norm": 0.36536191361376724, + "learning_rate": 7.889159704574803e-06, + "loss": 0.4936, + "step": 2693 + }, + { + "epoch": 1.868238557558946, + "grad_norm": 0.33926462202525187, + "learning_rate": 7.887183692617125e-06, + "loss": 0.4961, + "step": 2694 + }, + { + "epoch": 1.8689320388349513, + "grad_norm": 0.35716225248485955, + "learning_rate": 7.885207003924498e-06, + "loss": 0.5053, + "step": 2695 + }, + { + "epoch": 1.869625520110957, + "grad_norm": 0.3258382192439157, + "learning_rate": 7.883229638960246e-06, + "loss": 0.4656, + "step": 2696 + }, + { + "epoch": 1.8703190013869624, + "grad_norm": 0.34032656016382073, + "learning_rate": 7.881251598187848e-06, + "loss": 0.5263, + "step": 2697 + }, + { + "epoch": 1.8710124826629682, + "grad_norm": 0.36476247725557487, + "learning_rate": 7.879272882070942e-06, + "loss": 0.5284, + "step": 2698 + }, + { + "epoch": 1.8717059639389735, + "grad_norm": 0.3724168644047688, + "learning_rate": 7.877293491073325e-06, + "loss": 0.4892, + "step": 2699 + }, + { + "epoch": 1.8723994452149793, + "grad_norm": 0.38303081629382435, + "learning_rate": 7.875313425658955e-06, + "loss": 0.4655, + "step": 2700 + }, + { + "epoch": 1.8730929264909846, + "grad_norm": 0.3091757085341014, + "learning_rate": 7.873332686291939e-06, + "loss": 0.4978, + "step": 2701 + }, + { + "epoch": 1.8737864077669903, + "grad_norm": 0.3370798668074445, + "learning_rate": 7.87135127343655e-06, + "loss": 0.4282, + "step": 2702 + }, + { + "epoch": 1.8744798890429957, + "grad_norm": 0.3379343713716113, + "learning_rate": 7.86936918755722e-06, + "loss": 0.4275, + "step": 2703 + }, + { + "epoch": 1.8751733703190014, + "grad_norm": 0.3587033618478191, + "learning_rate": 7.867386429118533e-06, + "loss": 0.4646, + "step": 2704 + }, + { + "epoch": 1.875866851595007, + "grad_norm": 0.3645155839705385, + "learning_rate": 7.865402998585234e-06, + "loss": 0.535, + "step": 2705 + }, + { + "epoch": 1.8765603328710125, + "grad_norm": 0.3685151032638385, + "learning_rate": 7.863418896422223e-06, + "loss": 0.5579, + "step": 2706 + }, + { + "epoch": 1.877253814147018, + "grad_norm": 0.3315053069842496, + "learning_rate": 7.86143412309456e-06, + "loss": 0.4734, + "step": 2707 + }, + { + "epoch": 1.8779472954230236, + "grad_norm": 0.4015552806113017, + "learning_rate": 7.859448679067465e-06, + "loss": 0.5254, + "step": 2708 + }, + { + "epoch": 1.8786407766990292, + "grad_norm": 0.38131337231205575, + "learning_rate": 7.857462564806306e-06, + "loss": 0.5127, + "step": 2709 + }, + { + "epoch": 1.8793342579750347, + "grad_norm": 0.31246765375989466, + "learning_rate": 7.855475780776618e-06, + "loss": 0.4548, + "step": 2710 + }, + { + "epoch": 1.8800277392510403, + "grad_norm": 0.3371769696481026, + "learning_rate": 7.853488327444085e-06, + "loss": 0.5073, + "step": 2711 + }, + { + "epoch": 1.8807212205270458, + "grad_norm": 0.38750013754468865, + "learning_rate": 7.851500205274556e-06, + "loss": 0.549, + "step": 2712 + }, + { + "epoch": 1.8814147018030514, + "grad_norm": 0.3704598385623532, + "learning_rate": 7.849511414734031e-06, + "loss": 0.4886, + "step": 2713 + }, + { + "epoch": 1.882108183079057, + "grad_norm": 0.3680224526882509, + "learning_rate": 7.847521956288667e-06, + "loss": 0.4985, + "step": 2714 + }, + { + "epoch": 1.8828016643550625, + "grad_norm": 0.32918344876562927, + "learning_rate": 7.845531830404779e-06, + "loss": 0.4803, + "step": 2715 + }, + { + "epoch": 1.883495145631068, + "grad_norm": 0.3624569516420403, + "learning_rate": 7.84354103754884e-06, + "loss": 0.4642, + "step": 2716 + }, + { + "epoch": 1.8841886269070736, + "grad_norm": 0.31070697654179563, + "learning_rate": 7.841549578187472e-06, + "loss": 0.4699, + "step": 2717 + }, + { + "epoch": 1.884882108183079, + "grad_norm": 0.35520493491293814, + "learning_rate": 7.839557452787465e-06, + "loss": 0.5316, + "step": 2718 + }, + { + "epoch": 1.8855755894590847, + "grad_norm": 0.3570877311507715, + "learning_rate": 7.837564661815755e-06, + "loss": 0.4928, + "step": 2719 + }, + { + "epoch": 1.8862690707350902, + "grad_norm": 0.36233011008533733, + "learning_rate": 7.835571205739438e-06, + "loss": 0.4261, + "step": 2720 + }, + { + "epoch": 1.8869625520110958, + "grad_norm": 0.33391896874979543, + "learning_rate": 7.833577085025768e-06, + "loss": 0.4577, + "step": 2721 + }, + { + "epoch": 1.8876560332871013, + "grad_norm": 0.3316454736722919, + "learning_rate": 7.831582300142151e-06, + "loss": 0.5035, + "step": 2722 + }, + { + "epoch": 1.8883495145631068, + "grad_norm": 0.3850383328808543, + "learning_rate": 7.82958685155615e-06, + "loss": 0.4705, + "step": 2723 + }, + { + "epoch": 1.8890429958391124, + "grad_norm": 0.36773321773866596, + "learning_rate": 7.827590739735483e-06, + "loss": 0.4951, + "step": 2724 + }, + { + "epoch": 1.889736477115118, + "grad_norm": 0.4207221739505761, + "learning_rate": 7.825593965148027e-06, + "loss": 0.5365, + "step": 2725 + }, + { + "epoch": 1.8904299583911235, + "grad_norm": 0.3405928361906911, + "learning_rate": 7.823596528261808e-06, + "loss": 0.5039, + "step": 2726 + }, + { + "epoch": 1.891123439667129, + "grad_norm": 0.3453213609250516, + "learning_rate": 7.821598429545011e-06, + "loss": 0.484, + "step": 2727 + }, + { + "epoch": 1.8918169209431346, + "grad_norm": 0.3330266683001898, + "learning_rate": 7.819599669465979e-06, + "loss": 0.4482, + "step": 2728 + }, + { + "epoch": 1.8925104022191401, + "grad_norm": 0.3650550445029445, + "learning_rate": 7.817600248493205e-06, + "loss": 0.5187, + "step": 2729 + }, + { + "epoch": 1.8932038834951457, + "grad_norm": 0.358788925844345, + "learning_rate": 7.815600167095338e-06, + "loss": 0.5525, + "step": 2730 + }, + { + "epoch": 1.8938973647711512, + "grad_norm": 0.32218523816677525, + "learning_rate": 7.813599425741183e-06, + "loss": 0.4722, + "step": 2731 + }, + { + "epoch": 1.8945908460471568, + "grad_norm": 0.3456065736851976, + "learning_rate": 7.8115980248997e-06, + "loss": 0.5284, + "step": 2732 + }, + { + "epoch": 1.8952843273231623, + "grad_norm": 0.3073424730771214, + "learning_rate": 7.809595965040002e-06, + "loss": 0.4835, + "step": 2733 + }, + { + "epoch": 1.8959778085991679, + "grad_norm": 0.3547197711091453, + "learning_rate": 7.80759324663136e-06, + "loss": 0.4857, + "step": 2734 + }, + { + "epoch": 1.8966712898751734, + "grad_norm": 0.3563225377706775, + "learning_rate": 7.805589870143193e-06, + "loss": 0.5348, + "step": 2735 + }, + { + "epoch": 1.897364771151179, + "grad_norm": 0.3603077268316012, + "learning_rate": 7.80358583604508e-06, + "loss": 0.4679, + "step": 2736 + }, + { + "epoch": 1.8980582524271845, + "grad_norm": 0.3724255462235156, + "learning_rate": 7.801581144806752e-06, + "loss": 0.5121, + "step": 2737 + }, + { + "epoch": 1.89875173370319, + "grad_norm": 0.3706606750205616, + "learning_rate": 7.799575796898091e-06, + "loss": 0.4483, + "step": 2738 + }, + { + "epoch": 1.8994452149791956, + "grad_norm": 0.3514347423782457, + "learning_rate": 7.797569792789142e-06, + "loss": 0.4928, + "step": 2739 + }, + { + "epoch": 1.9001386962552012, + "grad_norm": 0.3723169049743826, + "learning_rate": 7.795563132950092e-06, + "loss": 0.5125, + "step": 2740 + }, + { + "epoch": 1.9008321775312067, + "grad_norm": 0.3568896023824196, + "learning_rate": 7.79355581785129e-06, + "loss": 0.5542, + "step": 2741 + }, + { + "epoch": 1.9015256588072122, + "grad_norm": 0.3478798711559774, + "learning_rate": 7.791547847963237e-06, + "loss": 0.5586, + "step": 2742 + }, + { + "epoch": 1.9022191400832178, + "grad_norm": 0.3853549614511584, + "learning_rate": 7.789539223756588e-06, + "loss": 0.5559, + "step": 2743 + }, + { + "epoch": 1.9029126213592233, + "grad_norm": 0.3342837293657271, + "learning_rate": 7.787529945702145e-06, + "loss": 0.4549, + "step": 2744 + }, + { + "epoch": 1.903606102635229, + "grad_norm": 0.32087241344602785, + "learning_rate": 7.785520014270872e-06, + "loss": 0.4428, + "step": 2745 + }, + { + "epoch": 1.9042995839112344, + "grad_norm": 0.3186490035876303, + "learning_rate": 7.78350942993388e-06, + "loss": 0.455, + "step": 2746 + }, + { + "epoch": 1.90499306518724, + "grad_norm": 0.31509241310589053, + "learning_rate": 7.781498193162438e-06, + "loss": 0.4628, + "step": 2747 + }, + { + "epoch": 1.9056865464632455, + "grad_norm": 0.3553360158393702, + "learning_rate": 7.779486304427963e-06, + "loss": 0.5081, + "step": 2748 + }, + { + "epoch": 1.906380027739251, + "grad_norm": 0.3566553822952972, + "learning_rate": 7.77747376420203e-06, + "loss": 0.5137, + "step": 2749 + }, + { + "epoch": 1.9070735090152566, + "grad_norm": 0.3722863664868598, + "learning_rate": 7.775460572956361e-06, + "loss": 0.5842, + "step": 2750 + }, + { + "epoch": 1.9077669902912622, + "grad_norm": 0.3543461820550464, + "learning_rate": 7.773446731162835e-06, + "loss": 0.4936, + "step": 2751 + }, + { + "epoch": 1.9084604715672677, + "grad_norm": 0.32936054817425064, + "learning_rate": 7.771432239293481e-06, + "loss": 0.514, + "step": 2752 + }, + { + "epoch": 1.9091539528432733, + "grad_norm": 0.3136306514466323, + "learning_rate": 7.769417097820481e-06, + "loss": 0.4096, + "step": 2753 + }, + { + "epoch": 1.9098474341192788, + "grad_norm": 0.3516954106782875, + "learning_rate": 7.767401307216172e-06, + "loss": 0.4762, + "step": 2754 + }, + { + "epoch": 1.9105409153952844, + "grad_norm": 0.9020798668799241, + "learning_rate": 7.765384867953038e-06, + "loss": 0.4829, + "step": 2755 + }, + { + "epoch": 1.91123439667129, + "grad_norm": 0.3463716414672137, + "learning_rate": 7.763367780503719e-06, + "loss": 0.5018, + "step": 2756 + }, + { + "epoch": 1.9119278779472955, + "grad_norm": 0.38085216185148424, + "learning_rate": 7.761350045341008e-06, + "loss": 0.5202, + "step": 2757 + }, + { + "epoch": 1.912621359223301, + "grad_norm": 0.3353323256554222, + "learning_rate": 7.759331662937841e-06, + "loss": 0.4762, + "step": 2758 + }, + { + "epoch": 1.9133148404993066, + "grad_norm": 0.34961297442086087, + "learning_rate": 7.757312633767318e-06, + "loss": 0.4961, + "step": 2759 + }, + { + "epoch": 1.914008321775312, + "grad_norm": 0.362212565909931, + "learning_rate": 7.755292958302683e-06, + "loss": 0.4875, + "step": 2760 + }, + { + "epoch": 1.9147018030513177, + "grad_norm": 0.32826385896123367, + "learning_rate": 7.753272637017333e-06, + "loss": 0.4604, + "step": 2761 + }, + { + "epoch": 1.9153952843273232, + "grad_norm": 0.3843771857848706, + "learning_rate": 7.751251670384818e-06, + "loss": 0.4562, + "step": 2762 + }, + { + "epoch": 1.9160887656033287, + "grad_norm": 0.33096456418013687, + "learning_rate": 7.749230058878836e-06, + "loss": 0.5049, + "step": 2763 + }, + { + "epoch": 1.9167822468793343, + "grad_norm": 0.39058020155023987, + "learning_rate": 7.74720780297324e-06, + "loss": 0.5548, + "step": 2764 + }, + { + "epoch": 1.9174757281553398, + "grad_norm": 0.33683295308446404, + "learning_rate": 7.745184903142029e-06, + "loss": 0.4681, + "step": 2765 + }, + { + "epoch": 1.9181692094313454, + "grad_norm": 0.34708818095467187, + "learning_rate": 7.74316135985936e-06, + "loss": 0.4647, + "step": 2766 + }, + { + "epoch": 1.918862690707351, + "grad_norm": 0.4075859168031003, + "learning_rate": 7.741137173599535e-06, + "loss": 0.5048, + "step": 2767 + }, + { + "epoch": 1.9195561719833565, + "grad_norm": 0.38467243108916777, + "learning_rate": 7.73911234483701e-06, + "loss": 0.5401, + "step": 2768 + }, + { + "epoch": 1.920249653259362, + "grad_norm": 0.32085151563485503, + "learning_rate": 7.737086874046387e-06, + "loss": 0.4932, + "step": 2769 + }, + { + "epoch": 1.9209431345353676, + "grad_norm": 0.38608992478875187, + "learning_rate": 7.735060761702425e-06, + "loss": 0.4777, + "step": 2770 + }, + { + "epoch": 1.9216366158113731, + "grad_norm": 0.35737555911257957, + "learning_rate": 7.733034008280027e-06, + "loss": 0.5644, + "step": 2771 + }, + { + "epoch": 1.9223300970873787, + "grad_norm": 0.3064138618678669, + "learning_rate": 7.731006614254252e-06, + "loss": 0.4465, + "step": 2772 + }, + { + "epoch": 1.9230235783633842, + "grad_norm": 0.3286043684447899, + "learning_rate": 7.728978580100304e-06, + "loss": 0.4594, + "step": 2773 + }, + { + "epoch": 1.9237170596393898, + "grad_norm": 0.3768782061453021, + "learning_rate": 7.726949906293544e-06, + "loss": 0.4771, + "step": 2774 + }, + { + "epoch": 1.9244105409153953, + "grad_norm": 0.364390945751054, + "learning_rate": 7.724920593309474e-06, + "loss": 0.5307, + "step": 2775 + }, + { + "epoch": 1.9251040221914009, + "grad_norm": 0.40577039878012, + "learning_rate": 7.722890641623752e-06, + "loss": 0.518, + "step": 2776 + }, + { + "epoch": 1.9257975034674064, + "grad_norm": 0.33581686460478327, + "learning_rate": 7.720860051712183e-06, + "loss": 0.4859, + "step": 2777 + }, + { + "epoch": 1.926490984743412, + "grad_norm": 0.46543626832142315, + "learning_rate": 7.718828824050722e-06, + "loss": 0.4795, + "step": 2778 + }, + { + "epoch": 1.9271844660194175, + "grad_norm": 0.35468745606402036, + "learning_rate": 7.716796959115479e-06, + "loss": 0.4779, + "step": 2779 + }, + { + "epoch": 1.927877947295423, + "grad_norm": 0.3788801104084805, + "learning_rate": 7.714764457382702e-06, + "loss": 0.5067, + "step": 2780 + }, + { + "epoch": 1.9285714285714286, + "grad_norm": 0.364114949919868, + "learning_rate": 7.712731319328798e-06, + "loss": 0.4666, + "step": 2781 + }, + { + "epoch": 1.9292649098474342, + "grad_norm": 0.3718239953310221, + "learning_rate": 7.71069754543032e-06, + "loss": 0.4694, + "step": 2782 + }, + { + "epoch": 1.9299583911234397, + "grad_norm": 0.36806678477105553, + "learning_rate": 7.708663136163967e-06, + "loss": 0.5129, + "step": 2783 + }, + { + "epoch": 1.9306518723994452, + "grad_norm": 0.326818402462561, + "learning_rate": 7.706628092006594e-06, + "loss": 0.4398, + "step": 2784 + }, + { + "epoch": 1.9313453536754508, + "grad_norm": 0.3223274110953189, + "learning_rate": 7.7045924134352e-06, + "loss": 0.4829, + "step": 2785 + }, + { + "epoch": 1.9320388349514563, + "grad_norm": 0.3977613310599147, + "learning_rate": 7.70255610092693e-06, + "loss": 0.5025, + "step": 2786 + }, + { + "epoch": 1.9327323162274619, + "grad_norm": 0.3254185114008018, + "learning_rate": 7.700519154959081e-06, + "loss": 0.5241, + "step": 2787 + }, + { + "epoch": 1.9334257975034674, + "grad_norm": 0.3183943101006737, + "learning_rate": 7.698481576009102e-06, + "loss": 0.4771, + "step": 2788 + }, + { + "epoch": 1.934119278779473, + "grad_norm": 0.36246545128956614, + "learning_rate": 7.696443364554584e-06, + "loss": 0.4734, + "step": 2789 + }, + { + "epoch": 1.9348127600554785, + "grad_norm": 0.3484678616251616, + "learning_rate": 7.694404521073273e-06, + "loss": 0.495, + "step": 2790 + }, + { + "epoch": 1.935506241331484, + "grad_norm": 0.3668175506071657, + "learning_rate": 7.692365046043053e-06, + "loss": 0.4889, + "step": 2791 + }, + { + "epoch": 1.9361997226074896, + "grad_norm": 0.3463910744025464, + "learning_rate": 7.690324939941964e-06, + "loss": 0.5102, + "step": 2792 + }, + { + "epoch": 1.9368932038834952, + "grad_norm": 0.35160264974419514, + "learning_rate": 7.688284203248197e-06, + "loss": 0.4805, + "step": 2793 + }, + { + "epoch": 1.9375866851595007, + "grad_norm": 0.3512818815864015, + "learning_rate": 7.686242836440081e-06, + "loss": 0.4844, + "step": 2794 + }, + { + "epoch": 1.9382801664355063, + "grad_norm": 0.4050336666596714, + "learning_rate": 7.684200839996099e-06, + "loss": 0.5719, + "step": 2795 + }, + { + "epoch": 1.9389736477115118, + "grad_norm": 0.36132159999999863, + "learning_rate": 7.682158214394878e-06, + "loss": 0.5198, + "step": 2796 + }, + { + "epoch": 1.9396671289875174, + "grad_norm": 0.3638888963990286, + "learning_rate": 7.680114960115198e-06, + "loss": 0.5061, + "step": 2797 + }, + { + "epoch": 1.940360610263523, + "grad_norm": 0.33836739852120684, + "learning_rate": 7.678071077635981e-06, + "loss": 0.5627, + "step": 2798 + }, + { + "epoch": 1.9410540915395285, + "grad_norm": 0.3396319108032648, + "learning_rate": 7.676026567436301e-06, + "loss": 0.4828, + "step": 2799 + }, + { + "epoch": 1.941747572815534, + "grad_norm": 0.3530008796442068, + "learning_rate": 7.673981429995372e-06, + "loss": 0.5012, + "step": 2800 + }, + { + "epoch": 1.9424410540915396, + "grad_norm": 0.38349473669569306, + "learning_rate": 7.671935665792563e-06, + "loss": 0.5359, + "step": 2801 + }, + { + "epoch": 1.943134535367545, + "grad_norm": 0.3742179351323327, + "learning_rate": 7.669889275307384e-06, + "loss": 0.4488, + "step": 2802 + }, + { + "epoch": 1.9438280166435506, + "grad_norm": 0.3367164020056258, + "learning_rate": 7.667842259019495e-06, + "loss": 0.4947, + "step": 2803 + }, + { + "epoch": 1.9445214979195562, + "grad_norm": 0.3188297402918376, + "learning_rate": 7.665794617408703e-06, + "loss": 0.4908, + "step": 2804 + }, + { + "epoch": 1.9452149791955617, + "grad_norm": 0.34876096692317154, + "learning_rate": 7.663746350954957e-06, + "loss": 0.4422, + "step": 2805 + }, + { + "epoch": 1.9459084604715673, + "grad_norm": 0.47167493465687316, + "learning_rate": 7.661697460138362e-06, + "loss": 0.4648, + "step": 2806 + }, + { + "epoch": 1.9466019417475728, + "grad_norm": 0.3340119781612177, + "learning_rate": 7.659647945439157e-06, + "loss": 0.491, + "step": 2807 + }, + { + "epoch": 1.9472954230235784, + "grad_norm": 0.35635117982991515, + "learning_rate": 7.657597807337735e-06, + "loss": 0.4986, + "step": 2808 + }, + { + "epoch": 1.947988904299584, + "grad_norm": 0.3099310922886787, + "learning_rate": 7.655547046314635e-06, + "loss": 0.4429, + "step": 2809 + }, + { + "epoch": 1.9486823855755895, + "grad_norm": 0.335658427457914, + "learning_rate": 7.65349566285054e-06, + "loss": 0.4762, + "step": 2810 + }, + { + "epoch": 1.949375866851595, + "grad_norm": 0.36516184574224014, + "learning_rate": 7.651443657426279e-06, + "loss": 0.4886, + "step": 2811 + }, + { + "epoch": 1.9500693481276006, + "grad_norm": 0.3334221155337291, + "learning_rate": 7.649391030522828e-06, + "loss": 0.4539, + "step": 2812 + }, + { + "epoch": 1.9507628294036061, + "grad_norm": 0.3628991245166753, + "learning_rate": 7.647337782621308e-06, + "loss": 0.5172, + "step": 2813 + }, + { + "epoch": 1.9514563106796117, + "grad_norm": 0.3456110006514414, + "learning_rate": 7.645283914202981e-06, + "loss": 0.4333, + "step": 2814 + }, + { + "epoch": 1.9521497919556172, + "grad_norm": 0.31888828958161214, + "learning_rate": 7.643229425749265e-06, + "loss": 0.4501, + "step": 2815 + }, + { + "epoch": 1.9528432732316228, + "grad_norm": 0.3622790204057956, + "learning_rate": 7.641174317741716e-06, + "loss": 0.4869, + "step": 2816 + }, + { + "epoch": 1.9535367545076283, + "grad_norm": 0.3419542518804844, + "learning_rate": 7.639118590662033e-06, + "loss": 0.5091, + "step": 2817 + }, + { + "epoch": 1.9542302357836339, + "grad_norm": 0.3599515415634431, + "learning_rate": 7.637062244992065e-06, + "loss": 0.487, + "step": 2818 + }, + { + "epoch": 1.9549237170596394, + "grad_norm": 0.4461919939769592, + "learning_rate": 7.635005281213808e-06, + "loss": 0.5199, + "step": 2819 + }, + { + "epoch": 1.955617198335645, + "grad_norm": 0.3228715182026137, + "learning_rate": 7.632947699809395e-06, + "loss": 0.4787, + "step": 2820 + }, + { + "epoch": 1.9563106796116505, + "grad_norm": 0.3538224220591444, + "learning_rate": 7.63088950126111e-06, + "loss": 0.5129, + "step": 2821 + }, + { + "epoch": 1.957004160887656, + "grad_norm": 0.32099229921982103, + "learning_rate": 7.6288306860513804e-06, + "loss": 0.4536, + "step": 2822 + }, + { + "epoch": 1.9576976421636616, + "grad_norm": 0.3698865009052739, + "learning_rate": 7.626771254662776e-06, + "loss": 0.5426, + "step": 2823 + }, + { + "epoch": 1.9583911234396671, + "grad_norm": 0.32399056399673626, + "learning_rate": 7.624711207578015e-06, + "loss": 0.4455, + "step": 2824 + }, + { + "epoch": 1.9590846047156727, + "grad_norm": 0.29898733204345623, + "learning_rate": 7.622650545279954e-06, + "loss": 0.4694, + "step": 2825 + }, + { + "epoch": 1.9597780859916782, + "grad_norm": 0.3401556086147256, + "learning_rate": 7.6205892682516e-06, + "loss": 0.4763, + "step": 2826 + }, + { + "epoch": 1.9604715672676838, + "grad_norm": 0.36168017056263757, + "learning_rate": 7.6185273769761015e-06, + "loss": 0.4781, + "step": 2827 + }, + { + "epoch": 1.9611650485436893, + "grad_norm": 0.3569963385716288, + "learning_rate": 7.616464871936748e-06, + "loss": 0.5229, + "step": 2828 + }, + { + "epoch": 1.9618585298196949, + "grad_norm": 0.5028901182808291, + "learning_rate": 7.61440175361698e-06, + "loss": 0.5477, + "step": 2829 + }, + { + "epoch": 1.9625520110957004, + "grad_norm": 0.3862379396196428, + "learning_rate": 7.612338022500375e-06, + "loss": 0.5427, + "step": 2830 + }, + { + "epoch": 1.963245492371706, + "grad_norm": 0.3175003879623789, + "learning_rate": 7.6102736790706575e-06, + "loss": 0.4993, + "step": 2831 + }, + { + "epoch": 1.9639389736477115, + "grad_norm": 0.375401967459228, + "learning_rate": 7.608208723811693e-06, + "loss": 0.4883, + "step": 2832 + }, + { + "epoch": 1.964632454923717, + "grad_norm": 0.3399053720557915, + "learning_rate": 7.606143157207493e-06, + "loss": 0.5016, + "step": 2833 + }, + { + "epoch": 1.9653259361997226, + "grad_norm": 0.41120677640801506, + "learning_rate": 7.604076979742212e-06, + "loss": 0.5139, + "step": 2834 + }, + { + "epoch": 1.9660194174757282, + "grad_norm": 0.3378315201752423, + "learning_rate": 7.602010191900147e-06, + "loss": 0.4636, + "step": 2835 + }, + { + "epoch": 1.9667128987517337, + "grad_norm": 0.33524211782526003, + "learning_rate": 7.599942794165738e-06, + "loss": 0.5007, + "step": 2836 + }, + { + "epoch": 1.9674063800277393, + "grad_norm": 0.3440131442432083, + "learning_rate": 7.597874787023565e-06, + "loss": 0.5117, + "step": 2837 + }, + { + "epoch": 1.9680998613037448, + "grad_norm": 0.34427218589372166, + "learning_rate": 7.59580617095836e-06, + "loss": 0.4298, + "step": 2838 + }, + { + "epoch": 1.9687933425797504, + "grad_norm": 0.3498259579987603, + "learning_rate": 7.593736946454986e-06, + "loss": 0.4979, + "step": 2839 + }, + { + "epoch": 1.969486823855756, + "grad_norm": 0.31895484693353576, + "learning_rate": 7.591667113998458e-06, + "loss": 0.4851, + "step": 2840 + }, + { + "epoch": 1.9701803051317615, + "grad_norm": 0.3017840035013218, + "learning_rate": 7.589596674073927e-06, + "loss": 0.4467, + "step": 2841 + }, + { + "epoch": 1.970873786407767, + "grad_norm": 0.37740803352290486, + "learning_rate": 7.587525627166691e-06, + "loss": 0.4592, + "step": 2842 + }, + { + "epoch": 1.9715672676837726, + "grad_norm": 0.36929200701190473, + "learning_rate": 7.585453973762188e-06, + "loss": 0.4949, + "step": 2843 + }, + { + "epoch": 1.972260748959778, + "grad_norm": 0.32701615822251884, + "learning_rate": 7.583381714345999e-06, + "loss": 0.5043, + "step": 2844 + }, + { + "epoch": 1.9729542302357836, + "grad_norm": 0.38039091569376876, + "learning_rate": 7.581308849403843e-06, + "loss": 0.4584, + "step": 2845 + }, + { + "epoch": 1.9736477115117892, + "grad_norm": 0.3740860012794304, + "learning_rate": 7.5792353794215885e-06, + "loss": 0.5545, + "step": 2846 + }, + { + "epoch": 1.9743411927877947, + "grad_norm": 0.31243044334493275, + "learning_rate": 7.577161304885242e-06, + "loss": 0.4663, + "step": 2847 + }, + { + "epoch": 1.9750346740638003, + "grad_norm": 0.3380881854703883, + "learning_rate": 7.575086626280951e-06, + "loss": 0.4747, + "step": 2848 + }, + { + "epoch": 1.9757281553398058, + "grad_norm": 0.37490035147652145, + "learning_rate": 7.573011344095002e-06, + "loss": 0.4645, + "step": 2849 + }, + { + "epoch": 1.9764216366158114, + "grad_norm": 0.3485353886235364, + "learning_rate": 7.5709354588138296e-06, + "loss": 0.517, + "step": 2850 + }, + { + "epoch": 1.977115117891817, + "grad_norm": 0.3654716702171952, + "learning_rate": 7.568858970924006e-06, + "loss": 0.5205, + "step": 2851 + }, + { + "epoch": 1.9778085991678225, + "grad_norm": 0.3639664302165563, + "learning_rate": 7.566781880912244e-06, + "loss": 0.4633, + "step": 2852 + }, + { + "epoch": 1.978502080443828, + "grad_norm": 0.34349319790231575, + "learning_rate": 7.564704189265397e-06, + "loss": 0.4813, + "step": 2853 + }, + { + "epoch": 1.9791955617198336, + "grad_norm": 0.3361512833833259, + "learning_rate": 7.5626258964704634e-06, + "loss": 0.4829, + "step": 2854 + }, + { + "epoch": 1.9798890429958391, + "grad_norm": 0.33618350305353856, + "learning_rate": 7.56054700301458e-06, + "loss": 0.4414, + "step": 2855 + }, + { + "epoch": 1.9805825242718447, + "grad_norm": 0.6431040010129025, + "learning_rate": 7.558467509385023e-06, + "loss": 0.4686, + "step": 2856 + }, + { + "epoch": 1.9812760055478502, + "grad_norm": 0.30993907367331835, + "learning_rate": 7.5563874160692105e-06, + "loss": 0.4699, + "step": 2857 + }, + { + "epoch": 1.9819694868238558, + "grad_norm": 0.3230497093400171, + "learning_rate": 7.554306723554702e-06, + "loss": 0.4792, + "step": 2858 + }, + { + "epoch": 1.9826629680998613, + "grad_norm": 0.38336764278694396, + "learning_rate": 7.552225432329196e-06, + "loss": 0.4523, + "step": 2859 + }, + { + "epoch": 1.9833564493758669, + "grad_norm": 0.3489004199044597, + "learning_rate": 7.5501435428805345e-06, + "loss": 0.4555, + "step": 2860 + }, + { + "epoch": 1.9840499306518724, + "grad_norm": 0.3386246511215877, + "learning_rate": 7.548061055696696e-06, + "loss": 0.504, + "step": 2861 + }, + { + "epoch": 1.984743411927878, + "grad_norm": 0.3369402473820756, + "learning_rate": 7.545977971265799e-06, + "loss": 0.4438, + "step": 2862 + }, + { + "epoch": 1.9854368932038835, + "grad_norm": 0.30702527260405243, + "learning_rate": 7.5438942900761035e-06, + "loss": 0.4479, + "step": 2863 + }, + { + "epoch": 1.986130374479889, + "grad_norm": 0.31792988533349564, + "learning_rate": 7.541810012616011e-06, + "loss": 0.4393, + "step": 2864 + }, + { + "epoch": 1.9868238557558946, + "grad_norm": 0.331116213260764, + "learning_rate": 7.53972513937406e-06, + "loss": 0.5213, + "step": 2865 + }, + { + "epoch": 1.9875173370319001, + "grad_norm": 0.35078977085481916, + "learning_rate": 7.53763967083893e-06, + "loss": 0.4797, + "step": 2866 + }, + { + "epoch": 1.9882108183079057, + "grad_norm": 0.34163638207233277, + "learning_rate": 7.535553607499438e-06, + "loss": 0.4716, + "step": 2867 + }, + { + "epoch": 1.9889042995839112, + "grad_norm": 0.33940154559423774, + "learning_rate": 7.5334669498445454e-06, + "loss": 0.5085, + "step": 2868 + }, + { + "epoch": 1.9895977808599168, + "grad_norm": 0.3333087061478696, + "learning_rate": 7.531379698363348e-06, + "loss": 0.5063, + "step": 2869 + }, + { + "epoch": 1.9902912621359223, + "grad_norm": 0.3649615626540554, + "learning_rate": 7.529291853545082e-06, + "loss": 0.4847, + "step": 2870 + }, + { + "epoch": 1.9909847434119279, + "grad_norm": 0.7002542693293046, + "learning_rate": 7.527203415879125e-06, + "loss": 0.482, + "step": 2871 + }, + { + "epoch": 1.9916782246879334, + "grad_norm": 0.345445271912478, + "learning_rate": 7.525114385854988e-06, + "loss": 0.4726, + "step": 2872 + }, + { + "epoch": 1.992371705963939, + "grad_norm": 0.3549213985304822, + "learning_rate": 7.523024763962328e-06, + "loss": 0.4848, + "step": 2873 + }, + { + "epoch": 1.9930651872399445, + "grad_norm": 0.41542124087712934, + "learning_rate": 7.5209345506909346e-06, + "loss": 0.4491, + "step": 2874 + }, + { + "epoch": 1.99375866851595, + "grad_norm": 0.34636334795885454, + "learning_rate": 7.5188437465307415e-06, + "loss": 0.5321, + "step": 2875 + }, + { + "epoch": 1.9944521497919556, + "grad_norm": 0.34611763216721525, + "learning_rate": 7.5167523519718155e-06, + "loss": 0.4643, + "step": 2876 + }, + { + "epoch": 1.9951456310679612, + "grad_norm": 0.3557109697301609, + "learning_rate": 7.514660367504368e-06, + "loss": 0.4692, + "step": 2877 + }, + { + "epoch": 1.9958391123439667, + "grad_norm": 0.4323397683262994, + "learning_rate": 7.512567793618738e-06, + "loss": 0.4938, + "step": 2878 + }, + { + "epoch": 1.9965325936199723, + "grad_norm": 0.3561912840695031, + "learning_rate": 7.5104746308054165e-06, + "loss": 0.4396, + "step": 2879 + }, + { + "epoch": 1.9972260748959778, + "grad_norm": 0.34254835637409276, + "learning_rate": 7.508380879555024e-06, + "loss": 0.5006, + "step": 2880 + }, + { + "epoch": 1.9979195561719834, + "grad_norm": 0.38501081852223085, + "learning_rate": 7.506286540358318e-06, + "loss": 0.4974, + "step": 2881 + }, + { + "epoch": 1.998613037447989, + "grad_norm": 0.3069095824305059, + "learning_rate": 7.5041916137062e-06, + "loss": 0.4315, + "step": 2882 + }, + { + "epoch": 1.9993065187239945, + "grad_norm": 0.36986927906164246, + "learning_rate": 7.502096100089702e-06, + "loss": 0.5097, + "step": 2883 + }, + { + "epoch": 2.0, + "grad_norm": 0.3397957650017145, + "learning_rate": 7.500000000000001e-06, + "loss": 0.5056, + "step": 2884 + }, + { + "epoch": 2.0006934812760058, + "grad_norm": 0.7146834103015289, + "learning_rate": 7.497903313928405e-06, + "loss": 0.4523, + "step": 2885 + }, + { + "epoch": 2.001386962552011, + "grad_norm": 0.6180314412448197, + "learning_rate": 7.495806042366363e-06, + "loss": 0.4056, + "step": 2886 + }, + { + "epoch": 2.002080443828017, + "grad_norm": 0.29687979662832115, + "learning_rate": 7.493708185805459e-06, + "loss": 0.348, + "step": 2887 + }, + { + "epoch": 2.002773925104022, + "grad_norm": 0.3614447867498667, + "learning_rate": 7.4916097447374194e-06, + "loss": 0.4276, + "step": 2888 + }, + { + "epoch": 2.003467406380028, + "grad_norm": 0.4058802506700272, + "learning_rate": 7.4895107196541e-06, + "loss": 0.441, + "step": 2889 + }, + { + "epoch": 2.0041608876560333, + "grad_norm": 0.3420417130227295, + "learning_rate": 7.4874111110474955e-06, + "loss": 0.4234, + "step": 2890 + }, + { + "epoch": 2.004854368932039, + "grad_norm": 0.38846000992080426, + "learning_rate": 7.485310919409742e-06, + "loss": 0.4373, + "step": 2891 + }, + { + "epoch": 2.0055478502080444, + "grad_norm": 0.3623736861191617, + "learning_rate": 7.48321014523311e-06, + "loss": 0.4106, + "step": 2892 + }, + { + "epoch": 2.00624133148405, + "grad_norm": 0.3642800458680427, + "learning_rate": 7.481108789010003e-06, + "loss": 0.433, + "step": 2893 + }, + { + "epoch": 2.0069348127600555, + "grad_norm": 0.36524010205980534, + "learning_rate": 7.479006851232965e-06, + "loss": 0.4436, + "step": 2894 + }, + { + "epoch": 2.0076282940360612, + "grad_norm": 0.3429227978002902, + "learning_rate": 7.4769043323946746e-06, + "loss": 0.428, + "step": 2895 + }, + { + "epoch": 2.0083217753120666, + "grad_norm": 0.35987936381576785, + "learning_rate": 7.474801232987948e-06, + "loss": 0.4653, + "step": 2896 + }, + { + "epoch": 2.0090152565880723, + "grad_norm": 0.332590635210399, + "learning_rate": 7.472697553505736e-06, + "loss": 0.3612, + "step": 2897 + }, + { + "epoch": 2.0097087378640777, + "grad_norm": 0.3490804631637133, + "learning_rate": 7.470593294441124e-06, + "loss": 0.4747, + "step": 2898 + }, + { + "epoch": 2.0104022191400834, + "grad_norm": 0.4125588583396638, + "learning_rate": 7.4684884562873375e-06, + "loss": 0.4373, + "step": 2899 + }, + { + "epoch": 2.0110957004160888, + "grad_norm": 0.34728420195893356, + "learning_rate": 7.466383039537735e-06, + "loss": 0.4514, + "step": 2900 + }, + { + "epoch": 2.0117891816920945, + "grad_norm": 0.33132310844990925, + "learning_rate": 7.46427704468581e-06, + "loss": 0.3716, + "step": 2901 + }, + { + "epoch": 2.0124826629681, + "grad_norm": 0.37546869539441957, + "learning_rate": 7.462170472225194e-06, + "loss": 0.4145, + "step": 2902 + }, + { + "epoch": 2.0131761442441056, + "grad_norm": 0.4315665627849969, + "learning_rate": 7.4600633226496485e-06, + "loss": 0.4715, + "step": 2903 + }, + { + "epoch": 2.013869625520111, + "grad_norm": 0.3437342558007185, + "learning_rate": 7.4579555964530795e-06, + "loss": 0.3959, + "step": 2904 + }, + { + "epoch": 2.0145631067961167, + "grad_norm": 0.34810430787405483, + "learning_rate": 7.455847294129519e-06, + "loss": 0.4211, + "step": 2905 + }, + { + "epoch": 2.015256588072122, + "grad_norm": 0.37888935111198124, + "learning_rate": 7.453738416173139e-06, + "loss": 0.4392, + "step": 2906 + }, + { + "epoch": 2.015950069348128, + "grad_norm": 0.4734181119166493, + "learning_rate": 7.451628963078245e-06, + "loss": 0.3913, + "step": 2907 + }, + { + "epoch": 2.016643550624133, + "grad_norm": 0.34919890086071553, + "learning_rate": 7.449518935339276e-06, + "loss": 0.4501, + "step": 2908 + }, + { + "epoch": 2.017337031900139, + "grad_norm": 0.3703329840280313, + "learning_rate": 7.447408333450811e-06, + "loss": 0.5084, + "step": 2909 + }, + { + "epoch": 2.0180305131761442, + "grad_norm": 0.3525130938500406, + "learning_rate": 7.445297157907557e-06, + "loss": 0.4458, + "step": 2910 + }, + { + "epoch": 2.01872399445215, + "grad_norm": 0.37238934327990236, + "learning_rate": 7.443185409204359e-06, + "loss": 0.4836, + "step": 2911 + }, + { + "epoch": 2.0194174757281553, + "grad_norm": 0.361261183126272, + "learning_rate": 7.4410730878361936e-06, + "loss": 0.407, + "step": 2912 + }, + { + "epoch": 2.020110957004161, + "grad_norm": 0.37564813665004965, + "learning_rate": 7.438960194298178e-06, + "loss": 0.4606, + "step": 2913 + }, + { + "epoch": 2.0208044382801664, + "grad_norm": 0.3352371928825262, + "learning_rate": 7.436846729085556e-06, + "loss": 0.4128, + "step": 2914 + }, + { + "epoch": 2.021497919556172, + "grad_norm": 0.4015755166593209, + "learning_rate": 7.434732692693708e-06, + "loss": 0.459, + "step": 2915 + }, + { + "epoch": 2.0221914008321775, + "grad_norm": 0.36629021998122735, + "learning_rate": 7.432618085618152e-06, + "loss": 0.4833, + "step": 2916 + }, + { + "epoch": 2.0228848821081833, + "grad_norm": 0.3774399648148289, + "learning_rate": 7.430502908354532e-06, + "loss": 0.4579, + "step": 2917 + }, + { + "epoch": 2.0235783633841886, + "grad_norm": 0.37726337936849413, + "learning_rate": 7.428387161398635e-06, + "loss": 0.3907, + "step": 2918 + }, + { + "epoch": 2.0242718446601944, + "grad_norm": 0.34428719555303283, + "learning_rate": 7.426270845246373e-06, + "loss": 0.4596, + "step": 2919 + }, + { + "epoch": 2.0249653259361997, + "grad_norm": 0.33825621691918, + "learning_rate": 7.424153960393798e-06, + "loss": 0.4181, + "step": 2920 + }, + { + "epoch": 2.0256588072122055, + "grad_norm": 0.33770870571869777, + "learning_rate": 7.42203650733709e-06, + "loss": 0.4201, + "step": 2921 + }, + { + "epoch": 2.026352288488211, + "grad_norm": 0.3687196751817226, + "learning_rate": 7.419918486572568e-06, + "loss": 0.3984, + "step": 2922 + }, + { + "epoch": 2.0270457697642166, + "grad_norm": 0.42522280921204547, + "learning_rate": 7.417799898596676e-06, + "loss": 0.4014, + "step": 2923 + }, + { + "epoch": 2.027739251040222, + "grad_norm": 0.3531416005902581, + "learning_rate": 7.415680743906001e-06, + "loss": 0.3954, + "step": 2924 + }, + { + "epoch": 2.0284327323162277, + "grad_norm": 0.3540475558874119, + "learning_rate": 7.413561022997253e-06, + "loss": 0.4317, + "step": 2925 + }, + { + "epoch": 2.029126213592233, + "grad_norm": 0.3910373342437148, + "learning_rate": 7.411440736367281e-06, + "loss": 0.4213, + "step": 2926 + }, + { + "epoch": 2.0298196948682388, + "grad_norm": 0.35841904355533144, + "learning_rate": 7.4093198845130666e-06, + "loss": 0.4636, + "step": 2927 + }, + { + "epoch": 2.030513176144244, + "grad_norm": 0.3835539168709186, + "learning_rate": 7.407198467931718e-06, + "loss": 0.4555, + "step": 2928 + }, + { + "epoch": 2.03120665742025, + "grad_norm": 0.3556215946429487, + "learning_rate": 7.405076487120484e-06, + "loss": 0.4876, + "step": 2929 + }, + { + "epoch": 2.031900138696255, + "grad_norm": 0.39795708209783903, + "learning_rate": 7.402953942576738e-06, + "loss": 0.4596, + "step": 2930 + }, + { + "epoch": 2.032593619972261, + "grad_norm": 0.35453873412030984, + "learning_rate": 7.400830834797993e-06, + "loss": 0.4156, + "step": 2931 + }, + { + "epoch": 2.0332871012482663, + "grad_norm": 0.36434897526467763, + "learning_rate": 7.398707164281887e-06, + "loss": 0.4472, + "step": 2932 + }, + { + "epoch": 2.033980582524272, + "grad_norm": 0.35989755605118917, + "learning_rate": 7.396582931526194e-06, + "loss": 0.4459, + "step": 2933 + }, + { + "epoch": 2.0346740638002774, + "grad_norm": 0.38672833092948306, + "learning_rate": 7.394458137028818e-06, + "loss": 0.4441, + "step": 2934 + }, + { + "epoch": 2.035367545076283, + "grad_norm": 0.32193021063115684, + "learning_rate": 7.392332781287798e-06, + "loss": 0.4076, + "step": 2935 + }, + { + "epoch": 2.0360610263522885, + "grad_norm": 0.3484149984725337, + "learning_rate": 7.390206864801298e-06, + "loss": 0.4895, + "step": 2936 + }, + { + "epoch": 2.0367545076282942, + "grad_norm": 0.37047634197176654, + "learning_rate": 7.388080388067621e-06, + "loss": 0.457, + "step": 2937 + }, + { + "epoch": 2.0374479889042996, + "grad_norm": 0.35108587056971163, + "learning_rate": 7.3859533515851955e-06, + "loss": 0.4553, + "step": 2938 + }, + { + "epoch": 2.0381414701803053, + "grad_norm": 0.3703643889712212, + "learning_rate": 7.383825755852585e-06, + "loss": 0.483, + "step": 2939 + }, + { + "epoch": 2.0388349514563107, + "grad_norm": 0.3630344879653978, + "learning_rate": 7.381697601368481e-06, + "loss": 0.4121, + "step": 2940 + }, + { + "epoch": 2.0395284327323164, + "grad_norm": 0.3302945348077549, + "learning_rate": 7.37956888863171e-06, + "loss": 0.4192, + "step": 2941 + }, + { + "epoch": 2.0402219140083218, + "grad_norm": 0.33815624190891896, + "learning_rate": 7.3774396181412235e-06, + "loss": 0.4395, + "step": 2942 + }, + { + "epoch": 2.0409153952843275, + "grad_norm": 0.4470430985635722, + "learning_rate": 7.375309790396108e-06, + "loss": 0.3777, + "step": 2943 + }, + { + "epoch": 2.041608876560333, + "grad_norm": 0.36314519372452403, + "learning_rate": 7.373179405895582e-06, + "loss": 0.4091, + "step": 2944 + }, + { + "epoch": 2.0423023578363386, + "grad_norm": 0.3673466840452378, + "learning_rate": 7.37104846513899e-06, + "loss": 0.453, + "step": 2945 + }, + { + "epoch": 2.042995839112344, + "grad_norm": 0.3522877191809448, + "learning_rate": 7.3689169686258096e-06, + "loss": 0.4332, + "step": 2946 + }, + { + "epoch": 2.0436893203883497, + "grad_norm": 0.37520075390767993, + "learning_rate": 7.36678491685565e-06, + "loss": 0.4642, + "step": 2947 + }, + { + "epoch": 2.044382801664355, + "grad_norm": 0.3578421132400116, + "learning_rate": 7.364652310328244e-06, + "loss": 0.4158, + "step": 2948 + }, + { + "epoch": 2.045076282940361, + "grad_norm": 0.3632511967074461, + "learning_rate": 7.362519149543464e-06, + "loss": 0.467, + "step": 2949 + }, + { + "epoch": 2.045769764216366, + "grad_norm": 0.355434381152111, + "learning_rate": 7.360385435001306e-06, + "loss": 0.4266, + "step": 2950 + }, + { + "epoch": 2.046463245492372, + "grad_norm": 0.38927581263690186, + "learning_rate": 7.358251167201896e-06, + "loss": 0.5017, + "step": 2951 + }, + { + "epoch": 2.0471567267683772, + "grad_norm": 0.35293879801406647, + "learning_rate": 7.356116346645491e-06, + "loss": 0.4259, + "step": 2952 + }, + { + "epoch": 2.047850208044383, + "grad_norm": 0.3249376480950482, + "learning_rate": 7.353980973832479e-06, + "loss": 0.3646, + "step": 2953 + }, + { + "epoch": 2.0485436893203883, + "grad_norm": 0.35646154115316836, + "learning_rate": 7.351845049263374e-06, + "loss": 0.4893, + "step": 2954 + }, + { + "epoch": 2.049237170596394, + "grad_norm": 0.3946491137363803, + "learning_rate": 7.349708573438824e-06, + "loss": 0.4026, + "step": 2955 + }, + { + "epoch": 2.0499306518723994, + "grad_norm": 0.42958126090594545, + "learning_rate": 7.3475715468596e-06, + "loss": 0.4654, + "step": 2956 + }, + { + "epoch": 2.050624133148405, + "grad_norm": 0.37665937795516446, + "learning_rate": 7.345433970026607e-06, + "loss": 0.3921, + "step": 2957 + }, + { + "epoch": 2.0513176144244105, + "grad_norm": 0.36466148379806207, + "learning_rate": 7.3432958434408806e-06, + "loss": 0.4573, + "step": 2958 + }, + { + "epoch": 2.0520110957004163, + "grad_norm": 0.3213186297254778, + "learning_rate": 7.341157167603579e-06, + "loss": 0.441, + "step": 2959 + }, + { + "epoch": 2.0527045769764216, + "grad_norm": 0.36908593397004924, + "learning_rate": 7.33901794301599e-06, + "loss": 0.4569, + "step": 2960 + }, + { + "epoch": 2.0533980582524274, + "grad_norm": 0.36794370552676486, + "learning_rate": 7.3368781701795365e-06, + "loss": 0.4259, + "step": 2961 + }, + { + "epoch": 2.0540915395284327, + "grad_norm": 0.3894273342876289, + "learning_rate": 7.3347378495957655e-06, + "loss": 0.3893, + "step": 2962 + }, + { + "epoch": 2.0547850208044385, + "grad_norm": 0.34623911584994665, + "learning_rate": 7.332596981766351e-06, + "loss": 0.4332, + "step": 2963 + }, + { + "epoch": 2.055478502080444, + "grad_norm": 0.34570402910906584, + "learning_rate": 7.330455567193095e-06, + "loss": 0.4265, + "step": 2964 + }, + { + "epoch": 2.0561719833564496, + "grad_norm": 0.35371820203398097, + "learning_rate": 7.328313606377936e-06, + "loss": 0.4415, + "step": 2965 + }, + { + "epoch": 2.056865464632455, + "grad_norm": 0.34726814199184525, + "learning_rate": 7.326171099822928e-06, + "loss": 0.4295, + "step": 2966 + }, + { + "epoch": 2.0575589459084607, + "grad_norm": 0.32356472950830084, + "learning_rate": 7.324028048030261e-06, + "loss": 0.3873, + "step": 2967 + }, + { + "epoch": 2.058252427184466, + "grad_norm": 0.38854895295362485, + "learning_rate": 7.321884451502252e-06, + "loss": 0.4454, + "step": 2968 + }, + { + "epoch": 2.0589459084604718, + "grad_norm": 0.34463233013673783, + "learning_rate": 7.319740310741342e-06, + "loss": 0.4627, + "step": 2969 + }, + { + "epoch": 2.059639389736477, + "grad_norm": 0.3522306545160089, + "learning_rate": 7.3175956262501035e-06, + "loss": 0.4286, + "step": 2970 + }, + { + "epoch": 2.060332871012483, + "grad_norm": 0.36957615734998583, + "learning_rate": 7.3154503985312366e-06, + "loss": 0.4693, + "step": 2971 + }, + { + "epoch": 2.061026352288488, + "grad_norm": 0.3656599956480459, + "learning_rate": 7.313304628087566e-06, + "loss": 0.4578, + "step": 2972 + }, + { + "epoch": 2.061719833564494, + "grad_norm": 0.40913095723021503, + "learning_rate": 7.311158315422041e-06, + "loss": 0.4654, + "step": 2973 + }, + { + "epoch": 2.0624133148404993, + "grad_norm": 0.3586285719040597, + "learning_rate": 7.309011461037749e-06, + "loss": 0.4297, + "step": 2974 + }, + { + "epoch": 2.063106796116505, + "grad_norm": 0.5045937777824696, + "learning_rate": 7.30686406543789e-06, + "loss": 0.4833, + "step": 2975 + }, + { + "epoch": 2.0638002773925104, + "grad_norm": 1.0804079677102074, + "learning_rate": 7.304716129125803e-06, + "loss": 0.4735, + "step": 2976 + }, + { + "epoch": 2.064493758668516, + "grad_norm": 0.3309159134019149, + "learning_rate": 7.302567652604945e-06, + "loss": 0.3879, + "step": 2977 + }, + { + "epoch": 2.0651872399445215, + "grad_norm": 0.37351848375722413, + "learning_rate": 7.300418636378907e-06, + "loss": 0.3958, + "step": 2978 + }, + { + "epoch": 2.0658807212205272, + "grad_norm": 0.3927459166930335, + "learning_rate": 7.2982690809514e-06, + "loss": 0.396, + "step": 2979 + }, + { + "epoch": 2.0665742024965326, + "grad_norm": 0.399726667879919, + "learning_rate": 7.296118986826266e-06, + "loss": 0.4771, + "step": 2980 + }, + { + "epoch": 2.0672676837725383, + "grad_norm": 0.36708468950617, + "learning_rate": 7.29396835450747e-06, + "loss": 0.4551, + "step": 2981 + }, + { + "epoch": 2.0679611650485437, + "grad_norm": 0.3902038836383935, + "learning_rate": 7.291817184499107e-06, + "loss": 0.4051, + "step": 2982 + }, + { + "epoch": 2.0686546463245494, + "grad_norm": 0.3954656237623094, + "learning_rate": 7.289665477305393e-06, + "loss": 0.4873, + "step": 2983 + }, + { + "epoch": 2.0693481276005548, + "grad_norm": 0.39177952831587315, + "learning_rate": 7.287513233430674e-06, + "loss": 0.4209, + "step": 2984 + }, + { + "epoch": 2.0700416088765605, + "grad_norm": 0.37088711369824945, + "learning_rate": 7.285360453379418e-06, + "loss": 0.4154, + "step": 2985 + }, + { + "epoch": 2.070735090152566, + "grad_norm": 0.36442790991643875, + "learning_rate": 7.283207137656226e-06, + "loss": 0.4753, + "step": 2986 + }, + { + "epoch": 2.0714285714285716, + "grad_norm": 0.41817756911810805, + "learning_rate": 7.281053286765816e-06, + "loss": 0.4955, + "step": 2987 + }, + { + "epoch": 2.072122052704577, + "grad_norm": 0.3751629121032016, + "learning_rate": 7.278898901213035e-06, + "loss": 0.4896, + "step": 2988 + }, + { + "epoch": 2.0728155339805827, + "grad_norm": 0.3294482776197221, + "learning_rate": 7.276743981502856e-06, + "loss": 0.3993, + "step": 2989 + }, + { + "epoch": 2.073509015256588, + "grad_norm": 0.3636883644535339, + "learning_rate": 7.274588528140378e-06, + "loss": 0.4578, + "step": 2990 + }, + { + "epoch": 2.074202496532594, + "grad_norm": 0.38090055533168865, + "learning_rate": 7.27243254163082e-06, + "loss": 0.4367, + "step": 2991 + }, + { + "epoch": 2.074895977808599, + "grad_norm": 0.3376329026686144, + "learning_rate": 7.270276022479534e-06, + "loss": 0.4756, + "step": 2992 + }, + { + "epoch": 2.075589459084605, + "grad_norm": 0.3642370726837966, + "learning_rate": 7.2681189711919896e-06, + "loss": 0.4669, + "step": 2993 + }, + { + "epoch": 2.0762829403606102, + "grad_norm": 0.4183134048378031, + "learning_rate": 7.265961388273785e-06, + "loss": 0.4387, + "step": 2994 + }, + { + "epoch": 2.076976421636616, + "grad_norm": 0.33477538317487254, + "learning_rate": 7.263803274230643e-06, + "loss": 0.3811, + "step": 2995 + }, + { + "epoch": 2.0776699029126213, + "grad_norm": 0.35281962859729005, + "learning_rate": 7.2616446295684075e-06, + "loss": 0.4388, + "step": 2996 + }, + { + "epoch": 2.078363384188627, + "grad_norm": 0.33350983930832584, + "learning_rate": 7.25948545479305e-06, + "loss": 0.4044, + "step": 2997 + }, + { + "epoch": 2.0790568654646324, + "grad_norm": 0.42407451571664223, + "learning_rate": 7.2573257504106665e-06, + "loss": 0.4759, + "step": 2998 + }, + { + "epoch": 2.079750346740638, + "grad_norm": 0.3510795526353381, + "learning_rate": 7.255165516927476e-06, + "loss": 0.4362, + "step": 2999 + }, + { + "epoch": 2.0804438280166435, + "grad_norm": 0.3317839288446993, + "learning_rate": 7.2530047548498205e-06, + "loss": 0.4157, + "step": 3000 + }, + { + "epoch": 2.0811373092926493, + "grad_norm": 0.3629897746754837, + "learning_rate": 7.2508434646841665e-06, + "loss": 0.4377, + "step": 3001 + }, + { + "epoch": 2.0818307905686546, + "grad_norm": 0.34710015516895404, + "learning_rate": 7.248681646937106e-06, + "loss": 0.4182, + "step": 3002 + }, + { + "epoch": 2.0825242718446604, + "grad_norm": 0.37606986087672556, + "learning_rate": 7.246519302115355e-06, + "loss": 0.4569, + "step": 3003 + }, + { + "epoch": 2.0832177531206657, + "grad_norm": 0.44421841377060706, + "learning_rate": 7.244356430725748e-06, + "loss": 0.4541, + "step": 3004 + }, + { + "epoch": 2.0839112343966715, + "grad_norm": 0.34865634699191783, + "learning_rate": 7.242193033275249e-06, + "loss": 0.4422, + "step": 3005 + }, + { + "epoch": 2.084604715672677, + "grad_norm": 0.3865292049377577, + "learning_rate": 7.24002911027094e-06, + "loss": 0.4824, + "step": 3006 + }, + { + "epoch": 2.0852981969486826, + "grad_norm": 0.4306864280596324, + "learning_rate": 7.237864662220032e-06, + "loss": 0.4513, + "step": 3007 + }, + { + "epoch": 2.085991678224688, + "grad_norm": 0.35316900899834214, + "learning_rate": 7.235699689629855e-06, + "loss": 0.4556, + "step": 3008 + }, + { + "epoch": 2.0866851595006937, + "grad_norm": 0.46068476114365564, + "learning_rate": 7.2335341930078614e-06, + "loss": 0.423, + "step": 3009 + }, + { + "epoch": 2.087378640776699, + "grad_norm": 0.3652607083383284, + "learning_rate": 7.23136817286163e-06, + "loss": 0.4481, + "step": 3010 + }, + { + "epoch": 2.0880721220527048, + "grad_norm": 0.34670932988684466, + "learning_rate": 7.229201629698857e-06, + "loss": 0.4677, + "step": 3011 + }, + { + "epoch": 2.08876560332871, + "grad_norm": 0.346006830243124, + "learning_rate": 7.22703456402737e-06, + "loss": 0.3987, + "step": 3012 + }, + { + "epoch": 2.089459084604716, + "grad_norm": 0.3699684146508989, + "learning_rate": 7.224866976355108e-06, + "loss": 0.3972, + "step": 3013 + }, + { + "epoch": 2.090152565880721, + "grad_norm": 0.36646039093412713, + "learning_rate": 7.22269886719014e-06, + "loss": 0.4152, + "step": 3014 + }, + { + "epoch": 2.090846047156727, + "grad_norm": 0.37413942838093034, + "learning_rate": 7.220530237040655e-06, + "loss": 0.4508, + "step": 3015 + }, + { + "epoch": 2.0915395284327323, + "grad_norm": 0.3934824522530865, + "learning_rate": 7.2183610864149655e-06, + "loss": 0.4506, + "step": 3016 + }, + { + "epoch": 2.092233009708738, + "grad_norm": 0.3404808850344954, + "learning_rate": 7.216191415821503e-06, + "loss": 0.4035, + "step": 3017 + }, + { + "epoch": 2.0929264909847434, + "grad_norm": 0.3326069502831703, + "learning_rate": 7.214021225768821e-06, + "loss": 0.41, + "step": 3018 + }, + { + "epoch": 2.093619972260749, + "grad_norm": 0.3519911319952195, + "learning_rate": 7.211850516765602e-06, + "loss": 0.4132, + "step": 3019 + }, + { + "epoch": 2.0943134535367545, + "grad_norm": 0.327221436188564, + "learning_rate": 7.209679289320638e-06, + "loss": 0.4113, + "step": 3020 + }, + { + "epoch": 2.0950069348127602, + "grad_norm": 0.36968738188853695, + "learning_rate": 7.2075075439428535e-06, + "loss": 0.438, + "step": 3021 + }, + { + "epoch": 2.0957004160887656, + "grad_norm": 0.37323195061462966, + "learning_rate": 7.205335281141287e-06, + "loss": 0.4161, + "step": 3022 + }, + { + "epoch": 2.0963938973647713, + "grad_norm": 0.3723826405101407, + "learning_rate": 7.203162501425103e-06, + "loss": 0.4414, + "step": 3023 + }, + { + "epoch": 2.0970873786407767, + "grad_norm": 0.5976546534809862, + "learning_rate": 7.200989205303583e-06, + "loss": 0.4725, + "step": 3024 + }, + { + "epoch": 2.0977808599167824, + "grad_norm": 0.394353759827912, + "learning_rate": 7.198815393286136e-06, + "loss": 0.456, + "step": 3025 + }, + { + "epoch": 2.0984743411927878, + "grad_norm": 0.42285231924559064, + "learning_rate": 7.196641065882285e-06, + "loss": 0.4505, + "step": 3026 + }, + { + "epoch": 2.0991678224687935, + "grad_norm": 0.44456228602603215, + "learning_rate": 7.1944662236016774e-06, + "loss": 0.4412, + "step": 3027 + }, + { + "epoch": 2.099861303744799, + "grad_norm": 0.35094056985690847, + "learning_rate": 7.192290866954078e-06, + "loss": 0.4403, + "step": 3028 + }, + { + "epoch": 2.1005547850208046, + "grad_norm": 0.3526100530140754, + "learning_rate": 7.190114996449378e-06, + "loss": 0.4057, + "step": 3029 + }, + { + "epoch": 2.10124826629681, + "grad_norm": 0.38574775919106535, + "learning_rate": 7.1879386125975836e-06, + "loss": 0.4768, + "step": 3030 + }, + { + "epoch": 2.1019417475728157, + "grad_norm": 0.3460237338522369, + "learning_rate": 7.185761715908826e-06, + "loss": 0.4319, + "step": 3031 + }, + { + "epoch": 2.102635228848821, + "grad_norm": 0.4088404125103431, + "learning_rate": 7.183584306893352e-06, + "loss": 0.4302, + "step": 3032 + }, + { + "epoch": 2.103328710124827, + "grad_norm": 0.3724657627204574, + "learning_rate": 7.181406386061529e-06, + "loss": 0.422, + "step": 3033 + }, + { + "epoch": 2.104022191400832, + "grad_norm": 0.39440336400991, + "learning_rate": 7.179227953923848e-06, + "loss": 0.4325, + "step": 3034 + }, + { + "epoch": 2.104715672676838, + "grad_norm": 0.4836264746543931, + "learning_rate": 7.177049010990917e-06, + "loss": 0.4715, + "step": 3035 + }, + { + "epoch": 2.1054091539528432, + "grad_norm": 0.34367887699703425, + "learning_rate": 7.174869557773467e-06, + "loss": 0.4061, + "step": 3036 + }, + { + "epoch": 2.106102635228849, + "grad_norm": 0.36229201910808145, + "learning_rate": 7.172689594782342e-06, + "loss": 0.4573, + "step": 3037 + }, + { + "epoch": 2.1067961165048543, + "grad_norm": 0.36976526386395087, + "learning_rate": 7.170509122528511e-06, + "loss": 0.4284, + "step": 3038 + }, + { + "epoch": 2.10748959778086, + "grad_norm": 0.4125742569348614, + "learning_rate": 7.168328141523062e-06, + "loss": 0.4333, + "step": 3039 + }, + { + "epoch": 2.1081830790568654, + "grad_norm": 0.34810355488800704, + "learning_rate": 7.1661466522772e-06, + "loss": 0.3828, + "step": 3040 + }, + { + "epoch": 2.108876560332871, + "grad_norm": 0.3790962333779141, + "learning_rate": 7.163964655302252e-06, + "loss": 0.5177, + "step": 3041 + }, + { + "epoch": 2.1095700416088765, + "grad_norm": 0.3665157730115209, + "learning_rate": 7.161782151109659e-06, + "loss": 0.4403, + "step": 3042 + }, + { + "epoch": 2.1102635228848823, + "grad_norm": 0.39432150364010454, + "learning_rate": 7.1595991402109865e-06, + "loss": 0.4574, + "step": 3043 + }, + { + "epoch": 2.1109570041608876, + "grad_norm": 0.40865901683254635, + "learning_rate": 7.157415623117917e-06, + "loss": 0.4795, + "step": 3044 + }, + { + "epoch": 2.1116504854368934, + "grad_norm": 0.3764617126662991, + "learning_rate": 7.15523160034225e-06, + "loss": 0.426, + "step": 3045 + }, + { + "epoch": 2.1123439667128987, + "grad_norm": 0.44733140346031974, + "learning_rate": 7.1530470723959045e-06, + "loss": 0.4029, + "step": 3046 + }, + { + "epoch": 2.1130374479889045, + "grad_norm": 0.371084336094575, + "learning_rate": 7.1508620397909175e-06, + "loss": 0.5242, + "step": 3047 + }, + { + "epoch": 2.11373092926491, + "grad_norm": 0.34374048081023245, + "learning_rate": 7.148676503039448e-06, + "loss": 0.42, + "step": 3048 + }, + { + "epoch": 2.1144244105409156, + "grad_norm": 0.3990430602739235, + "learning_rate": 7.146490462653767e-06, + "loss": 0.4535, + "step": 3049 + }, + { + "epoch": 2.115117891816921, + "grad_norm": 0.3780455875402749, + "learning_rate": 7.144303919146265e-06, + "loss": 0.479, + "step": 3050 + }, + { + "epoch": 2.1158113730929267, + "grad_norm": 0.36899951610153414, + "learning_rate": 7.142116873029455e-06, + "loss": 0.4002, + "step": 3051 + }, + { + "epoch": 2.116504854368932, + "grad_norm": 0.4093650961730342, + "learning_rate": 7.139929324815965e-06, + "loss": 0.4425, + "step": 3052 + }, + { + "epoch": 2.1171983356449378, + "grad_norm": 0.40942250431003674, + "learning_rate": 7.137741275018539e-06, + "loss": 0.4427, + "step": 3053 + }, + { + "epoch": 2.117891816920943, + "grad_norm": 0.3488974713103121, + "learning_rate": 7.135552724150041e-06, + "loss": 0.4239, + "step": 3054 + }, + { + "epoch": 2.118585298196949, + "grad_norm": 0.35935995772783713, + "learning_rate": 7.133363672723449e-06, + "loss": 0.3947, + "step": 3055 + }, + { + "epoch": 2.119278779472954, + "grad_norm": 0.35203242767318643, + "learning_rate": 7.131174121251864e-06, + "loss": 0.4452, + "step": 3056 + }, + { + "epoch": 2.11997226074896, + "grad_norm": 0.36801362436190616, + "learning_rate": 7.128984070248499e-06, + "loss": 0.4358, + "step": 3057 + }, + { + "epoch": 2.1206657420249653, + "grad_norm": 0.35498634891990455, + "learning_rate": 7.126793520226688e-06, + "loss": 0.3871, + "step": 3058 + }, + { + "epoch": 2.121359223300971, + "grad_norm": 0.3449513968787113, + "learning_rate": 7.124602471699878e-06, + "loss": 0.3842, + "step": 3059 + }, + { + "epoch": 2.1220527045769764, + "grad_norm": 0.3560759248219683, + "learning_rate": 7.1224109251816355e-06, + "loss": 0.3537, + "step": 3060 + }, + { + "epoch": 2.122746185852982, + "grad_norm": 0.3991859818076799, + "learning_rate": 7.120218881185644e-06, + "loss": 0.4124, + "step": 3061 + }, + { + "epoch": 2.1234396671289875, + "grad_norm": 0.3428981209932839, + "learning_rate": 7.118026340225701e-06, + "loss": 0.4028, + "step": 3062 + }, + { + "epoch": 2.1241331484049932, + "grad_norm": 0.3527433179290824, + "learning_rate": 7.115833302815724e-06, + "loss": 0.4375, + "step": 3063 + }, + { + "epoch": 2.1248266296809986, + "grad_norm": 0.4246884696511276, + "learning_rate": 7.113639769469744e-06, + "loss": 0.3882, + "step": 3064 + }, + { + "epoch": 2.1255201109570043, + "grad_norm": 0.35714650509889484, + "learning_rate": 7.11144574070191e-06, + "loss": 0.4215, + "step": 3065 + }, + { + "epoch": 2.1262135922330097, + "grad_norm": 0.34687010359136733, + "learning_rate": 7.109251217026487e-06, + "loss": 0.3993, + "step": 3066 + }, + { + "epoch": 2.1269070735090154, + "grad_norm": 0.33213104312134606, + "learning_rate": 7.1070561989578535e-06, + "loss": 0.3956, + "step": 3067 + }, + { + "epoch": 2.1276005547850207, + "grad_norm": 0.3612567948278647, + "learning_rate": 7.104860687010507e-06, + "loss": 0.4095, + "step": 3068 + }, + { + "epoch": 2.1282940360610265, + "grad_norm": 0.3743454949185115, + "learning_rate": 7.1026646816990596e-06, + "loss": 0.4732, + "step": 3069 + }, + { + "epoch": 2.128987517337032, + "grad_norm": 0.36277235569194677, + "learning_rate": 7.100468183538241e-06, + "loss": 0.3942, + "step": 3070 + }, + { + "epoch": 2.1296809986130376, + "grad_norm": 0.42424603861477894, + "learning_rate": 7.098271193042889e-06, + "loss": 0.4516, + "step": 3071 + }, + { + "epoch": 2.130374479889043, + "grad_norm": 0.3738954919270784, + "learning_rate": 7.096073710727968e-06, + "loss": 0.4001, + "step": 3072 + }, + { + "epoch": 2.1310679611650487, + "grad_norm": 0.3676848335117262, + "learning_rate": 7.0938757371085485e-06, + "loss": 0.3992, + "step": 3073 + }, + { + "epoch": 2.131761442441054, + "grad_norm": 0.3394703406308582, + "learning_rate": 7.091677272699823e-06, + "loss": 0.4266, + "step": 3074 + }, + { + "epoch": 2.13245492371706, + "grad_norm": 0.4136822507153506, + "learning_rate": 7.089478318017091e-06, + "loss": 0.4704, + "step": 3075 + }, + { + "epoch": 2.133148404993065, + "grad_norm": 0.36330960243837956, + "learning_rate": 7.0872788735757755e-06, + "loss": 0.443, + "step": 3076 + }, + { + "epoch": 2.133841886269071, + "grad_norm": 0.36941866023569536, + "learning_rate": 7.085078939891409e-06, + "loss": 0.4165, + "step": 3077 + }, + { + "epoch": 2.1345353675450762, + "grad_norm": 0.3435100398963316, + "learning_rate": 7.082878517479639e-06, + "loss": 0.421, + "step": 3078 + }, + { + "epoch": 2.135228848821082, + "grad_norm": 0.3762597369490691, + "learning_rate": 7.08067760685623e-06, + "loss": 0.4293, + "step": 3079 + }, + { + "epoch": 2.1359223300970873, + "grad_norm": 0.38771522549082293, + "learning_rate": 7.078476208537057e-06, + "loss": 0.4548, + "step": 3080 + }, + { + "epoch": 2.136615811373093, + "grad_norm": 0.47538159903011057, + "learning_rate": 7.076274323038117e-06, + "loss": 0.3915, + "step": 3081 + }, + { + "epoch": 2.1373092926490984, + "grad_norm": 0.40870763751556194, + "learning_rate": 7.074071950875509e-06, + "loss": 0.4298, + "step": 3082 + }, + { + "epoch": 2.138002773925104, + "grad_norm": 0.43511553462940733, + "learning_rate": 7.07186909256546e-06, + "loss": 0.4606, + "step": 3083 + }, + { + "epoch": 2.1386962552011095, + "grad_norm": 0.33652915412924644, + "learning_rate": 7.069665748624299e-06, + "loss": 0.4129, + "step": 3084 + }, + { + "epoch": 2.1393897364771153, + "grad_norm": 0.3931416637998521, + "learning_rate": 7.067461919568477e-06, + "loss": 0.4228, + "step": 3085 + }, + { + "epoch": 2.1400832177531206, + "grad_norm": 0.41083223367930966, + "learning_rate": 7.065257605914555e-06, + "loss": 0.4713, + "step": 3086 + }, + { + "epoch": 2.1407766990291264, + "grad_norm": 0.8211237999837143, + "learning_rate": 7.063052808179205e-06, + "loss": 0.4443, + "step": 3087 + }, + { + "epoch": 2.1414701803051317, + "grad_norm": 0.38815529192990306, + "learning_rate": 7.0608475268792186e-06, + "loss": 0.4617, + "step": 3088 + }, + { + "epoch": 2.1421636615811375, + "grad_norm": 0.348726077841462, + "learning_rate": 7.0586417625315e-06, + "loss": 0.4333, + "step": 3089 + }, + { + "epoch": 2.142857142857143, + "grad_norm": 0.3715224852636519, + "learning_rate": 7.056435515653059e-06, + "loss": 0.4456, + "step": 3090 + }, + { + "epoch": 2.1435506241331486, + "grad_norm": 0.384158212156197, + "learning_rate": 7.054228786761027e-06, + "loss": 0.4033, + "step": 3091 + }, + { + "epoch": 2.144244105409154, + "grad_norm": 0.374678479276744, + "learning_rate": 7.0520215763726444e-06, + "loss": 0.4377, + "step": 3092 + }, + { + "epoch": 2.1449375866851597, + "grad_norm": 0.394246413472569, + "learning_rate": 7.049813885005267e-06, + "loss": 0.4191, + "step": 3093 + }, + { + "epoch": 2.145631067961165, + "grad_norm": 0.4953657444002148, + "learning_rate": 7.04760571317636e-06, + "loss": 0.4662, + "step": 3094 + }, + { + "epoch": 2.1463245492371708, + "grad_norm": 0.35962145722718386, + "learning_rate": 7.0453970614035025e-06, + "loss": 0.428, + "step": 3095 + }, + { + "epoch": 2.147018030513176, + "grad_norm": 0.340209505876518, + "learning_rate": 7.043187930204387e-06, + "loss": 0.4125, + "step": 3096 + }, + { + "epoch": 2.147711511789182, + "grad_norm": 0.3697826337989141, + "learning_rate": 7.040978320096819e-06, + "loss": 0.4005, + "step": 3097 + }, + { + "epoch": 2.148404993065187, + "grad_norm": 0.35727219624826106, + "learning_rate": 7.038768231598715e-06, + "loss": 0.4242, + "step": 3098 + }, + { + "epoch": 2.149098474341193, + "grad_norm": 0.47038992732935203, + "learning_rate": 7.036557665228103e-06, + "loss": 0.4336, + "step": 3099 + }, + { + "epoch": 2.1497919556171983, + "grad_norm": 0.33951178416801997, + "learning_rate": 7.034346621503121e-06, + "loss": 0.446, + "step": 3100 + }, + { + "epoch": 2.150485436893204, + "grad_norm": 0.39454754315666807, + "learning_rate": 7.032135100942027e-06, + "loss": 0.3949, + "step": 3101 + }, + { + "epoch": 2.1511789181692094, + "grad_norm": 0.3608277708887644, + "learning_rate": 7.029923104063182e-06, + "loss": 0.4358, + "step": 3102 + }, + { + "epoch": 2.151872399445215, + "grad_norm": 0.363761875264272, + "learning_rate": 7.027710631385063e-06, + "loss": 0.412, + "step": 3103 + }, + { + "epoch": 2.1525658807212205, + "grad_norm": 0.3800460349549785, + "learning_rate": 7.025497683426257e-06, + "loss": 0.4185, + "step": 3104 + }, + { + "epoch": 2.1532593619972262, + "grad_norm": 0.329733700696235, + "learning_rate": 7.023284260705463e-06, + "loss": 0.3696, + "step": 3105 + }, + { + "epoch": 2.1539528432732316, + "grad_norm": 0.3743548356607152, + "learning_rate": 7.021070363741492e-06, + "loss": 0.407, + "step": 3106 + }, + { + "epoch": 2.1546463245492373, + "grad_norm": 0.4253620493594505, + "learning_rate": 7.018855993053266e-06, + "loss": 0.4243, + "step": 3107 + }, + { + "epoch": 2.1553398058252426, + "grad_norm": 0.3729463446838696, + "learning_rate": 7.016641149159816e-06, + "loss": 0.4135, + "step": 3108 + }, + { + "epoch": 2.1560332871012484, + "grad_norm": 0.37350361298635437, + "learning_rate": 7.0144258325802835e-06, + "loss": 0.3981, + "step": 3109 + }, + { + "epoch": 2.1567267683772537, + "grad_norm": 0.35199874919180735, + "learning_rate": 7.012210043833927e-06, + "loss": 0.446, + "step": 3110 + }, + { + "epoch": 2.1574202496532595, + "grad_norm": 0.3669594222490185, + "learning_rate": 7.00999378344011e-06, + "loss": 0.4265, + "step": 3111 + }, + { + "epoch": 2.158113730929265, + "grad_norm": 0.3508520756570443, + "learning_rate": 7.007777051918306e-06, + "loss": 0.4096, + "step": 3112 + }, + { + "epoch": 2.1588072122052706, + "grad_norm": 0.35348859791564874, + "learning_rate": 7.005559849788101e-06, + "loss": 0.3865, + "step": 3113 + }, + { + "epoch": 2.159500693481276, + "grad_norm": 0.3655679514995117, + "learning_rate": 7.003342177569195e-06, + "loss": 0.4527, + "step": 3114 + }, + { + "epoch": 2.1601941747572817, + "grad_norm": 0.375376852563299, + "learning_rate": 7.00112403578139e-06, + "loss": 0.466, + "step": 3115 + }, + { + "epoch": 2.160887656033287, + "grad_norm": 0.42216667649516487, + "learning_rate": 6.998905424944605e-06, + "loss": 0.3593, + "step": 3116 + }, + { + "epoch": 2.161581137309293, + "grad_norm": 0.3662857201076887, + "learning_rate": 6.996686345578863e-06, + "loss": 0.4118, + "step": 3117 + }, + { + "epoch": 2.162274618585298, + "grad_norm": 0.5380173495766039, + "learning_rate": 6.994466798204303e-06, + "loss": 0.3992, + "step": 3118 + }, + { + "epoch": 2.162968099861304, + "grad_norm": 0.3447286846349849, + "learning_rate": 6.992246783341171e-06, + "loss": 0.3635, + "step": 3119 + }, + { + "epoch": 2.163661581137309, + "grad_norm": 0.4352242220473789, + "learning_rate": 6.99002630150982e-06, + "loss": 0.4518, + "step": 3120 + }, + { + "epoch": 2.164355062413315, + "grad_norm": 0.3729191189855568, + "learning_rate": 6.987805353230719e-06, + "loss": 0.4443, + "step": 3121 + }, + { + "epoch": 2.1650485436893203, + "grad_norm": 0.42897560953798314, + "learning_rate": 6.985583939024436e-06, + "loss": 0.4223, + "step": 3122 + }, + { + "epoch": 2.165742024965326, + "grad_norm": 0.3828341124882771, + "learning_rate": 6.983362059411661e-06, + "loss": 0.4533, + "step": 3123 + }, + { + "epoch": 2.1664355062413314, + "grad_norm": 0.5212452572134227, + "learning_rate": 6.9811397149131835e-06, + "loss": 0.449, + "step": 3124 + }, + { + "epoch": 2.167128987517337, + "grad_norm": 0.3735744987624875, + "learning_rate": 6.978916906049903e-06, + "loss": 0.3937, + "step": 3125 + }, + { + "epoch": 2.1678224687933425, + "grad_norm": 0.4093629777835829, + "learning_rate": 6.976693633342833e-06, + "loss": 0.3823, + "step": 3126 + }, + { + "epoch": 2.1685159500693483, + "grad_norm": 0.3645001820211574, + "learning_rate": 6.97446989731309e-06, + "loss": 0.3985, + "step": 3127 + }, + { + "epoch": 2.1692094313453536, + "grad_norm": 0.32968239868725185, + "learning_rate": 6.972245698481903e-06, + "loss": 0.3925, + "step": 3128 + }, + { + "epoch": 2.1699029126213594, + "grad_norm": 0.4065184879704676, + "learning_rate": 6.970021037370609e-06, + "loss": 0.4927, + "step": 3129 + }, + { + "epoch": 2.1705963938973647, + "grad_norm": 0.3603595652767137, + "learning_rate": 6.967795914500651e-06, + "loss": 0.4516, + "step": 3130 + }, + { + "epoch": 2.1712898751733705, + "grad_norm": 0.4198167248069969, + "learning_rate": 6.965570330393582e-06, + "loss": 0.4499, + "step": 3131 + }, + { + "epoch": 2.171983356449376, + "grad_norm": 0.39279017404085115, + "learning_rate": 6.963344285571063e-06, + "loss": 0.3915, + "step": 3132 + }, + { + "epoch": 2.1726768377253816, + "grad_norm": 0.3635314062003914, + "learning_rate": 6.961117780554862e-06, + "loss": 0.445, + "step": 3133 + }, + { + "epoch": 2.173370319001387, + "grad_norm": 0.3832597576776832, + "learning_rate": 6.958890815866857e-06, + "loss": 0.4097, + "step": 3134 + }, + { + "epoch": 2.1740638002773927, + "grad_norm": 0.36993096949813076, + "learning_rate": 6.956663392029033e-06, + "loss": 0.4345, + "step": 3135 + }, + { + "epoch": 2.174757281553398, + "grad_norm": 0.3733518675027611, + "learning_rate": 6.9544355095634775e-06, + "loss": 0.396, + "step": 3136 + }, + { + "epoch": 2.1754507628294038, + "grad_norm": 0.35577485861920793, + "learning_rate": 6.9522071689923955e-06, + "loss": 0.4446, + "step": 3137 + }, + { + "epoch": 2.176144244105409, + "grad_norm": 0.345487235319404, + "learning_rate": 6.9499783708380904e-06, + "loss": 0.3872, + "step": 3138 + }, + { + "epoch": 2.176837725381415, + "grad_norm": 0.36688561634303574, + "learning_rate": 6.947749115622979e-06, + "loss": 0.4859, + "step": 3139 + }, + { + "epoch": 2.17753120665742, + "grad_norm": 0.35631342115427483, + "learning_rate": 6.945519403869581e-06, + "loss": 0.4116, + "step": 3140 + }, + { + "epoch": 2.178224687933426, + "grad_norm": 0.39777269551571115, + "learning_rate": 6.943289236100523e-06, + "loss": 0.4601, + "step": 3141 + }, + { + "epoch": 2.1789181692094313, + "grad_norm": 0.4227153104614663, + "learning_rate": 6.941058612838544e-06, + "loss": 0.4305, + "step": 3142 + }, + { + "epoch": 2.179611650485437, + "grad_norm": 0.34143561557334956, + "learning_rate": 6.938827534606484e-06, + "loss": 0.3897, + "step": 3143 + }, + { + "epoch": 2.1803051317614424, + "grad_norm": 0.39993084472906265, + "learning_rate": 6.936596001927292e-06, + "loss": 0.406, + "step": 3144 + }, + { + "epoch": 2.180998613037448, + "grad_norm": 0.35637437963580954, + "learning_rate": 6.93436401532402e-06, + "loss": 0.375, + "step": 3145 + }, + { + "epoch": 2.1816920943134535, + "grad_norm": 0.3766557843328465, + "learning_rate": 6.932131575319834e-06, + "loss": 0.3975, + "step": 3146 + }, + { + "epoch": 2.1823855755894592, + "grad_norm": 0.42950999063047757, + "learning_rate": 6.929898682437999e-06, + "loss": 0.419, + "step": 3147 + }, + { + "epoch": 2.1830790568654646, + "grad_norm": 0.49033590261343324, + "learning_rate": 6.927665337201891e-06, + "loss": 0.4568, + "step": 3148 + }, + { + "epoch": 2.1837725381414703, + "grad_norm": 0.38923798848248387, + "learning_rate": 6.925431540134988e-06, + "loss": 0.4744, + "step": 3149 + }, + { + "epoch": 2.1844660194174756, + "grad_norm": 0.37569690836507236, + "learning_rate": 6.923197291760876e-06, + "loss": 0.3993, + "step": 3150 + }, + { + "epoch": 2.1851595006934814, + "grad_norm": 0.3872062746291295, + "learning_rate": 6.9209625926032485e-06, + "loss": 0.4952, + "step": 3151 + }, + { + "epoch": 2.1858529819694867, + "grad_norm": 0.3755709994284284, + "learning_rate": 6.918727443185902e-06, + "loss": 0.438, + "step": 3152 + }, + { + "epoch": 2.1865464632454925, + "grad_norm": 0.37274394488620805, + "learning_rate": 6.916491844032736e-06, + "loss": 0.4165, + "step": 3153 + }, + { + "epoch": 2.187239944521498, + "grad_norm": 0.3671859855245455, + "learning_rate": 6.914255795667763e-06, + "loss": 0.467, + "step": 3154 + }, + { + "epoch": 2.1879334257975036, + "grad_norm": 0.34563104164031566, + "learning_rate": 6.912019298615097e-06, + "loss": 0.4284, + "step": 3155 + }, + { + "epoch": 2.188626907073509, + "grad_norm": 0.36065990000831494, + "learning_rate": 6.909782353398955e-06, + "loss": 0.4364, + "step": 3156 + }, + { + "epoch": 2.1893203883495147, + "grad_norm": 0.42302861767406164, + "learning_rate": 6.907544960543659e-06, + "loss": 0.4005, + "step": 3157 + }, + { + "epoch": 2.19001386962552, + "grad_norm": 0.4016607283196581, + "learning_rate": 6.905307120573639e-06, + "loss": 0.3988, + "step": 3158 + }, + { + "epoch": 2.190707350901526, + "grad_norm": 0.39093334562382, + "learning_rate": 6.903068834013429e-06, + "loss": 0.4261, + "step": 3159 + }, + { + "epoch": 2.191400832177531, + "grad_norm": 0.36793766843058306, + "learning_rate": 6.900830101387667e-06, + "loss": 0.4016, + "step": 3160 + }, + { + "epoch": 2.192094313453537, + "grad_norm": 0.3243424542807762, + "learning_rate": 6.8985909232210965e-06, + "loss": 0.3572, + "step": 3161 + }, + { + "epoch": 2.192787794729542, + "grad_norm": 0.3775776131699167, + "learning_rate": 6.896351300038564e-06, + "loss": 0.4612, + "step": 3162 + }, + { + "epoch": 2.193481276005548, + "grad_norm": 0.3818596641017086, + "learning_rate": 6.89411123236502e-06, + "loss": 0.4939, + "step": 3163 + }, + { + "epoch": 2.1941747572815533, + "grad_norm": 0.37812123779394696, + "learning_rate": 6.891870720725522e-06, + "loss": 0.414, + "step": 3164 + }, + { + "epoch": 2.194868238557559, + "grad_norm": 0.3835042224657585, + "learning_rate": 6.8896297656452286e-06, + "loss": 0.4332, + "step": 3165 + }, + { + "epoch": 2.1955617198335644, + "grad_norm": 0.34305892206165406, + "learning_rate": 6.887388367649402e-06, + "loss": 0.3728, + "step": 3166 + }, + { + "epoch": 2.19625520110957, + "grad_norm": 0.3799511406233889, + "learning_rate": 6.885146527263411e-06, + "loss": 0.4177, + "step": 3167 + }, + { + "epoch": 2.1969486823855755, + "grad_norm": 0.36465839462895966, + "learning_rate": 6.882904245012728e-06, + "loss": 0.3953, + "step": 3168 + }, + { + "epoch": 2.1976421636615813, + "grad_norm": 0.36502262785244294, + "learning_rate": 6.8806615214229275e-06, + "loss": 0.3942, + "step": 3169 + }, + { + "epoch": 2.1983356449375866, + "grad_norm": 0.44813633262880787, + "learning_rate": 6.878418357019685e-06, + "loss": 0.4682, + "step": 3170 + }, + { + "epoch": 2.1990291262135924, + "grad_norm": 1.1150107948643713, + "learning_rate": 6.8761747523287845e-06, + "loss": 0.4878, + "step": 3171 + }, + { + "epoch": 2.1997226074895977, + "grad_norm": 0.37817388216565617, + "learning_rate": 6.87393070787611e-06, + "loss": 0.4528, + "step": 3172 + }, + { + "epoch": 2.2004160887656035, + "grad_norm": 0.3481644514461239, + "learning_rate": 6.871686224187649e-06, + "loss": 0.3869, + "step": 3173 + }, + { + "epoch": 2.201109570041609, + "grad_norm": 0.6472600110428178, + "learning_rate": 6.869441301789492e-06, + "loss": 0.4672, + "step": 3174 + }, + { + "epoch": 2.2018030513176146, + "grad_norm": 0.396462841976969, + "learning_rate": 6.867195941207834e-06, + "loss": 0.5001, + "step": 3175 + }, + { + "epoch": 2.20249653259362, + "grad_norm": 0.33504670662848984, + "learning_rate": 6.864950142968969e-06, + "loss": 0.4202, + "step": 3176 + }, + { + "epoch": 2.2031900138696257, + "grad_norm": 0.48692526877310366, + "learning_rate": 6.862703907599298e-06, + "loss": 0.4479, + "step": 3177 + }, + { + "epoch": 2.203883495145631, + "grad_norm": 0.36751944278978205, + "learning_rate": 6.860457235625322e-06, + "loss": 0.4227, + "step": 3178 + }, + { + "epoch": 2.2045769764216367, + "grad_norm": 0.4051893471548496, + "learning_rate": 6.8582101275736436e-06, + "loss": 0.4523, + "step": 3179 + }, + { + "epoch": 2.205270457697642, + "grad_norm": 0.3556882051436896, + "learning_rate": 6.855962583970969e-06, + "loss": 0.4396, + "step": 3180 + }, + { + "epoch": 2.205963938973648, + "grad_norm": 0.37866140201299847, + "learning_rate": 6.853714605344105e-06, + "loss": 0.4349, + "step": 3181 + }, + { + "epoch": 2.206657420249653, + "grad_norm": 0.3734076208223684, + "learning_rate": 6.851466192219963e-06, + "loss": 0.382, + "step": 3182 + }, + { + "epoch": 2.207350901525659, + "grad_norm": 0.3435981580517485, + "learning_rate": 6.849217345125556e-06, + "loss": 0.4263, + "step": 3183 + }, + { + "epoch": 2.2080443828016643, + "grad_norm": 0.43374269513443797, + "learning_rate": 6.846968064587995e-06, + "loss": 0.4193, + "step": 3184 + }, + { + "epoch": 2.20873786407767, + "grad_norm": 0.42876485085711014, + "learning_rate": 6.844718351134496e-06, + "loss": 0.4792, + "step": 3185 + }, + { + "epoch": 2.2094313453536754, + "grad_norm": 0.3567072347550561, + "learning_rate": 6.842468205292375e-06, + "loss": 0.4143, + "step": 3186 + }, + { + "epoch": 2.210124826629681, + "grad_norm": 0.37299923097904636, + "learning_rate": 6.840217627589052e-06, + "loss": 0.4521, + "step": 3187 + }, + { + "epoch": 2.2108183079056865, + "grad_norm": 0.34832336457669794, + "learning_rate": 6.837966618552045e-06, + "loss": 0.4336, + "step": 3188 + }, + { + "epoch": 2.2115117891816922, + "grad_norm": 0.3679922938183328, + "learning_rate": 6.835715178708973e-06, + "loss": 0.4121, + "step": 3189 + }, + { + "epoch": 2.2122052704576975, + "grad_norm": 0.3867529363489136, + "learning_rate": 6.8334633085875564e-06, + "loss": 0.4892, + "step": 3190 + }, + { + "epoch": 2.2128987517337033, + "grad_norm": 0.4172097099888967, + "learning_rate": 6.831211008715619e-06, + "loss": 0.4199, + "step": 3191 + }, + { + "epoch": 2.2135922330097086, + "grad_norm": 0.3484205019547079, + "learning_rate": 6.828958279621085e-06, + "loss": 0.4068, + "step": 3192 + }, + { + "epoch": 2.2142857142857144, + "grad_norm": 0.37711309043688035, + "learning_rate": 6.8267051218319766e-06, + "loss": 0.4407, + "step": 3193 + }, + { + "epoch": 2.2149791955617197, + "grad_norm": 0.38901539219632253, + "learning_rate": 6.824451535876415e-06, + "loss": 0.4645, + "step": 3194 + }, + { + "epoch": 2.2156726768377255, + "grad_norm": 0.3797025186452667, + "learning_rate": 6.8221975222826276e-06, + "loss": 0.4624, + "step": 3195 + }, + { + "epoch": 2.216366158113731, + "grad_norm": 0.38908439812531065, + "learning_rate": 6.819943081578939e-06, + "loss": 0.4008, + "step": 3196 + }, + { + "epoch": 2.2170596393897366, + "grad_norm": 0.33759666688484663, + "learning_rate": 6.817688214293773e-06, + "loss": 0.3768, + "step": 3197 + }, + { + "epoch": 2.217753120665742, + "grad_norm": 0.36114655983941857, + "learning_rate": 6.815432920955652e-06, + "loss": 0.4129, + "step": 3198 + }, + { + "epoch": 2.2184466019417477, + "grad_norm": 0.37733284468929934, + "learning_rate": 6.813177202093203e-06, + "loss": 0.4333, + "step": 3199 + }, + { + "epoch": 2.219140083217753, + "grad_norm": 0.3751447894147838, + "learning_rate": 6.81092105823515e-06, + "loss": 0.4139, + "step": 3200 + }, + { + "epoch": 2.219833564493759, + "grad_norm": 0.39087858551090465, + "learning_rate": 6.808664489910317e-06, + "loss": 0.4604, + "step": 3201 + }, + { + "epoch": 2.220527045769764, + "grad_norm": 0.38706165970167433, + "learning_rate": 6.806407497647625e-06, + "loss": 0.4602, + "step": 3202 + }, + { + "epoch": 2.22122052704577, + "grad_norm": 0.3512169622422497, + "learning_rate": 6.8041500819760976e-06, + "loss": 0.3964, + "step": 3203 + }, + { + "epoch": 2.221914008321775, + "grad_norm": 0.3898499803767229, + "learning_rate": 6.801892243424859e-06, + "loss": 0.467, + "step": 3204 + }, + { + "epoch": 2.222607489597781, + "grad_norm": 0.3439941460324588, + "learning_rate": 6.799633982523128e-06, + "loss": 0.4175, + "step": 3205 + }, + { + "epoch": 2.2233009708737863, + "grad_norm": 0.38551815405469103, + "learning_rate": 6.797375299800224e-06, + "loss": 0.3867, + "step": 3206 + }, + { + "epoch": 2.223994452149792, + "grad_norm": 0.35032279732716465, + "learning_rate": 6.795116195785567e-06, + "loss": 0.4235, + "step": 3207 + }, + { + "epoch": 2.2246879334257974, + "grad_norm": 0.36497372750026075, + "learning_rate": 6.792856671008676e-06, + "loss": 0.4484, + "step": 3208 + }, + { + "epoch": 2.225381414701803, + "grad_norm": 0.3480387233277934, + "learning_rate": 6.790596725999166e-06, + "loss": 0.4292, + "step": 3209 + }, + { + "epoch": 2.2260748959778085, + "grad_norm": 0.33245022179722605, + "learning_rate": 6.788336361286751e-06, + "loss": 0.4586, + "step": 3210 + }, + { + "epoch": 2.2267683772538143, + "grad_norm": 0.3679765680882823, + "learning_rate": 6.786075577401243e-06, + "loss": 0.4184, + "step": 3211 + }, + { + "epoch": 2.2274618585298196, + "grad_norm": 0.3829055470881608, + "learning_rate": 6.7838143748725574e-06, + "loss": 0.4545, + "step": 3212 + }, + { + "epoch": 2.2281553398058254, + "grad_norm": 0.3952697984732193, + "learning_rate": 6.7815527542307e-06, + "loss": 0.4426, + "step": 3213 + }, + { + "epoch": 2.2288488210818307, + "grad_norm": 0.365894439012572, + "learning_rate": 6.7792907160057796e-06, + "loss": 0.49, + "step": 3214 + }, + { + "epoch": 2.2295423023578365, + "grad_norm": 0.39561522589371145, + "learning_rate": 6.777028260728002e-06, + "loss": 0.4203, + "step": 3215 + }, + { + "epoch": 2.230235783633842, + "grad_norm": 0.4230834702410087, + "learning_rate": 6.774765388927669e-06, + "loss": 0.4901, + "step": 3216 + }, + { + "epoch": 2.2309292649098476, + "grad_norm": 0.3983835450531227, + "learning_rate": 6.772502101135183e-06, + "loss": 0.4546, + "step": 3217 + }, + { + "epoch": 2.231622746185853, + "grad_norm": 0.39458703276296103, + "learning_rate": 6.7702383978810424e-06, + "loss": 0.4164, + "step": 3218 + }, + { + "epoch": 2.2323162274618586, + "grad_norm": 0.3532660353792984, + "learning_rate": 6.767974279695842e-06, + "loss": 0.4211, + "step": 3219 + }, + { + "epoch": 2.233009708737864, + "grad_norm": 0.35081341241152886, + "learning_rate": 6.765709747110274e-06, + "loss": 0.3935, + "step": 3220 + }, + { + "epoch": 2.2337031900138697, + "grad_norm": 0.36711314575138754, + "learning_rate": 6.763444800655128e-06, + "loss": 0.4287, + "step": 3221 + }, + { + "epoch": 2.234396671289875, + "grad_norm": 0.3893473906612097, + "learning_rate": 6.761179440861294e-06, + "loss": 0.4565, + "step": 3222 + }, + { + "epoch": 2.235090152565881, + "grad_norm": 0.33315561721493103, + "learning_rate": 6.758913668259753e-06, + "loss": 0.4101, + "step": 3223 + }, + { + "epoch": 2.235783633841886, + "grad_norm": 0.3735604377255463, + "learning_rate": 6.756647483381588e-06, + "loss": 0.4334, + "step": 3224 + }, + { + "epoch": 2.236477115117892, + "grad_norm": 0.35767189078117967, + "learning_rate": 6.754380886757973e-06, + "loss": 0.4336, + "step": 3225 + }, + { + "epoch": 2.2371705963938973, + "grad_norm": 0.42486391288728126, + "learning_rate": 6.752113878920186e-06, + "loss": 0.3996, + "step": 3226 + }, + { + "epoch": 2.237864077669903, + "grad_norm": 0.3689275305319877, + "learning_rate": 6.749846460399594e-06, + "loss": 0.3904, + "step": 3227 + }, + { + "epoch": 2.2385575589459084, + "grad_norm": 0.45038343291428184, + "learning_rate": 6.747578631727666e-06, + "loss": 0.3958, + "step": 3228 + }, + { + "epoch": 2.239251040221914, + "grad_norm": 0.3553606619890095, + "learning_rate": 6.745310393435962e-06, + "loss": 0.4125, + "step": 3229 + }, + { + "epoch": 2.2399445214979194, + "grad_norm": 0.4021476653867645, + "learning_rate": 6.743041746056142e-06, + "loss": 0.4599, + "step": 3230 + }, + { + "epoch": 2.240638002773925, + "grad_norm": 0.3509861883633808, + "learning_rate": 6.740772690119961e-06, + "loss": 0.4337, + "step": 3231 + }, + { + "epoch": 2.2413314840499305, + "grad_norm": 0.35051440733846, + "learning_rate": 6.738503226159269e-06, + "loss": 0.4098, + "step": 3232 + }, + { + "epoch": 2.2420249653259363, + "grad_norm": 0.3889774976379475, + "learning_rate": 6.736233354706011e-06, + "loss": 0.4487, + "step": 3233 + }, + { + "epoch": 2.2427184466019416, + "grad_norm": 0.3810766233493372, + "learning_rate": 6.7339630762922295e-06, + "loss": 0.4673, + "step": 3234 + }, + { + "epoch": 2.2434119278779474, + "grad_norm": 0.34061956942943267, + "learning_rate": 6.731692391450061e-06, + "loss": 0.4228, + "step": 3235 + }, + { + "epoch": 2.2441054091539527, + "grad_norm": 0.34316598103004414, + "learning_rate": 6.729421300711736e-06, + "loss": 0.3806, + "step": 3236 + }, + { + "epoch": 2.2447988904299585, + "grad_norm": 0.43164398669665005, + "learning_rate": 6.727149804609585e-06, + "loss": 0.4451, + "step": 3237 + }, + { + "epoch": 2.245492371705964, + "grad_norm": 0.35099462456805025, + "learning_rate": 6.724877903676028e-06, + "loss": 0.4492, + "step": 3238 + }, + { + "epoch": 2.2461858529819696, + "grad_norm": 0.3996144361019322, + "learning_rate": 6.722605598443581e-06, + "loss": 0.4268, + "step": 3239 + }, + { + "epoch": 2.246879334257975, + "grad_norm": 0.38121833561607527, + "learning_rate": 6.720332889444858e-06, + "loss": 0.4651, + "step": 3240 + }, + { + "epoch": 2.2475728155339807, + "grad_norm": 0.33121543000194664, + "learning_rate": 6.7180597772125665e-06, + "loss": 0.4076, + "step": 3241 + }, + { + "epoch": 2.248266296809986, + "grad_norm": 0.35819296532384437, + "learning_rate": 6.7157862622795044e-06, + "loss": 0.4598, + "step": 3242 + }, + { + "epoch": 2.248959778085992, + "grad_norm": 0.36257211594140176, + "learning_rate": 6.71351234517857e-06, + "loss": 0.4323, + "step": 3243 + }, + { + "epoch": 2.249653259361997, + "grad_norm": 0.4290610247647165, + "learning_rate": 6.71123802644275e-06, + "loss": 0.3887, + "step": 3244 + }, + { + "epoch": 2.250346740638003, + "grad_norm": 0.34849749579416356, + "learning_rate": 6.7089633066051315e-06, + "loss": 0.4401, + "step": 3245 + }, + { + "epoch": 2.251040221914008, + "grad_norm": 0.3576875889894574, + "learning_rate": 6.706688186198891e-06, + "loss": 0.4177, + "step": 3246 + }, + { + "epoch": 2.251733703190014, + "grad_norm": 0.3553490004661506, + "learning_rate": 6.7044126657572985e-06, + "loss": 0.4276, + "step": 3247 + }, + { + "epoch": 2.2524271844660193, + "grad_norm": 0.3371123295813205, + "learning_rate": 6.702136745813721e-06, + "loss": 0.3941, + "step": 3248 + }, + { + "epoch": 2.253120665742025, + "grad_norm": 0.33522352253075743, + "learning_rate": 6.69986042690162e-06, + "loss": 0.4505, + "step": 3249 + }, + { + "epoch": 2.2538141470180304, + "grad_norm": 0.3566783443007843, + "learning_rate": 6.697583709554545e-06, + "loss": 0.4071, + "step": 3250 + }, + { + "epoch": 2.254507628294036, + "grad_norm": 0.34829843827033774, + "learning_rate": 6.695306594306142e-06, + "loss": 0.3952, + "step": 3251 + }, + { + "epoch": 2.2552011095700415, + "grad_norm": 0.3502322200532421, + "learning_rate": 6.6930290816901515e-06, + "loss": 0.4552, + "step": 3252 + }, + { + "epoch": 2.2558945908460473, + "grad_norm": 0.37785150765385184, + "learning_rate": 6.6907511722404065e-06, + "loss": 0.4182, + "step": 3253 + }, + { + "epoch": 2.2565880721220526, + "grad_norm": 0.35506041130851373, + "learning_rate": 6.688472866490832e-06, + "loss": 0.4045, + "step": 3254 + }, + { + "epoch": 2.2572815533980584, + "grad_norm": 0.3930893511399995, + "learning_rate": 6.686194164975446e-06, + "loss": 0.4436, + "step": 3255 + }, + { + "epoch": 2.2579750346740637, + "grad_norm": 0.4037885650269651, + "learning_rate": 6.683915068228357e-06, + "loss": 0.4388, + "step": 3256 + }, + { + "epoch": 2.2586685159500695, + "grad_norm": 0.6676777687031498, + "learning_rate": 6.681635576783774e-06, + "loss": 0.4533, + "step": 3257 + }, + { + "epoch": 2.259361997226075, + "grad_norm": 0.3591696255670284, + "learning_rate": 6.679355691175991e-06, + "loss": 0.4628, + "step": 3258 + }, + { + "epoch": 2.2600554785020806, + "grad_norm": 0.3681087949489095, + "learning_rate": 6.677075411939396e-06, + "loss": 0.4764, + "step": 3259 + }, + { + "epoch": 2.260748959778086, + "grad_norm": 0.35689530521389246, + "learning_rate": 6.67479473960847e-06, + "loss": 0.4254, + "step": 3260 + }, + { + "epoch": 2.2614424410540916, + "grad_norm": 0.3710448540773542, + "learning_rate": 6.672513674717785e-06, + "loss": 0.4276, + "step": 3261 + }, + { + "epoch": 2.262135922330097, + "grad_norm": 0.39646594317541917, + "learning_rate": 6.670232217802011e-06, + "loss": 0.4152, + "step": 3262 + }, + { + "epoch": 2.2628294036061027, + "grad_norm": 0.3429166393044668, + "learning_rate": 6.6679503693959e-06, + "loss": 0.3593, + "step": 3263 + }, + { + "epoch": 2.263522884882108, + "grad_norm": 0.3544717212123084, + "learning_rate": 6.665668130034302e-06, + "loss": 0.456, + "step": 3264 + }, + { + "epoch": 2.264216366158114, + "grad_norm": 0.3574817978636266, + "learning_rate": 6.663385500252157e-06, + "loss": 0.4192, + "step": 3265 + }, + { + "epoch": 2.264909847434119, + "grad_norm": 0.34640457132811586, + "learning_rate": 6.661102480584498e-06, + "loss": 0.4146, + "step": 3266 + }, + { + "epoch": 2.265603328710125, + "grad_norm": 0.3728471078648733, + "learning_rate": 6.658819071566449e-06, + "loss": 0.4664, + "step": 3267 + }, + { + "epoch": 2.2662968099861303, + "grad_norm": 0.39774466842547634, + "learning_rate": 6.656535273733222e-06, + "loss": 0.4722, + "step": 3268 + }, + { + "epoch": 2.266990291262136, + "grad_norm": 0.3808029015924205, + "learning_rate": 6.654251087620125e-06, + "loss": 0.4513, + "step": 3269 + }, + { + "epoch": 2.2676837725381414, + "grad_norm": 0.37347801231352173, + "learning_rate": 6.651966513762552e-06, + "loss": 0.4798, + "step": 3270 + }, + { + "epoch": 2.268377253814147, + "grad_norm": 0.3872374945364614, + "learning_rate": 6.649681552695994e-06, + "loss": 0.4152, + "step": 3271 + }, + { + "epoch": 2.2690707350901524, + "grad_norm": 0.36121485886596594, + "learning_rate": 6.647396204956027e-06, + "loss": 0.4138, + "step": 3272 + }, + { + "epoch": 2.269764216366158, + "grad_norm": 0.3398182085904718, + "learning_rate": 6.6451104710783206e-06, + "loss": 0.4279, + "step": 3273 + }, + { + "epoch": 2.2704576976421635, + "grad_norm": 0.47824207638988286, + "learning_rate": 6.6428243515986355e-06, + "loss": 0.4477, + "step": 3274 + }, + { + "epoch": 2.2711511789181693, + "grad_norm": 0.3685727795408177, + "learning_rate": 6.640537847052818e-06, + "loss": 0.4073, + "step": 3275 + }, + { + "epoch": 2.2718446601941746, + "grad_norm": 0.3353690864388626, + "learning_rate": 6.638250957976813e-06, + "loss": 0.434, + "step": 3276 + }, + { + "epoch": 2.2725381414701804, + "grad_norm": 0.3435652355299191, + "learning_rate": 6.635963684906646e-06, + "loss": 0.4111, + "step": 3277 + }, + { + "epoch": 2.2732316227461857, + "grad_norm": 0.4221229601666925, + "learning_rate": 6.6336760283784395e-06, + "loss": 0.4429, + "step": 3278 + }, + { + "epoch": 2.2739251040221915, + "grad_norm": 0.3448233257899806, + "learning_rate": 6.631387988928404e-06, + "loss": 0.4537, + "step": 3279 + }, + { + "epoch": 2.274618585298197, + "grad_norm": 0.36684986111303636, + "learning_rate": 6.62909956709284e-06, + "loss": 0.4281, + "step": 3280 + }, + { + "epoch": 2.2753120665742026, + "grad_norm": 0.36405331636940524, + "learning_rate": 6.626810763408134e-06, + "loss": 0.4411, + "step": 3281 + }, + { + "epoch": 2.276005547850208, + "grad_norm": 0.356841560430753, + "learning_rate": 6.6245215784107695e-06, + "loss": 0.4239, + "step": 3282 + }, + { + "epoch": 2.2766990291262137, + "grad_norm": 0.3801864988090061, + "learning_rate": 6.6222320126373105e-06, + "loss": 0.4606, + "step": 3283 + }, + { + "epoch": 2.277392510402219, + "grad_norm": 0.364347127375704, + "learning_rate": 6.619942066624417e-06, + "loss": 0.4189, + "step": 3284 + }, + { + "epoch": 2.278085991678225, + "grad_norm": 0.34802934040034456, + "learning_rate": 6.617651740908835e-06, + "loss": 0.4335, + "step": 3285 + }, + { + "epoch": 2.27877947295423, + "grad_norm": 0.34152045906723943, + "learning_rate": 6.6153610360274014e-06, + "loss": 0.4255, + "step": 3286 + }, + { + "epoch": 2.279472954230236, + "grad_norm": 0.33991863437247355, + "learning_rate": 6.61306995251704e-06, + "loss": 0.4192, + "step": 3287 + }, + { + "epoch": 2.280166435506241, + "grad_norm": 0.36702944905564105, + "learning_rate": 6.610778490914763e-06, + "loss": 0.4414, + "step": 3288 + }, + { + "epoch": 2.280859916782247, + "grad_norm": 0.3363866659981957, + "learning_rate": 6.608486651757673e-06, + "loss": 0.4132, + "step": 3289 + }, + { + "epoch": 2.2815533980582523, + "grad_norm": 0.39834279294618163, + "learning_rate": 6.6061944355829634e-06, + "loss": 0.371, + "step": 3290 + }, + { + "epoch": 2.282246879334258, + "grad_norm": 0.3590341740384732, + "learning_rate": 6.603901842927909e-06, + "loss": 0.4607, + "step": 3291 + }, + { + "epoch": 2.2829403606102634, + "grad_norm": 0.38279593231597786, + "learning_rate": 6.601608874329879e-06, + "loss": 0.403, + "step": 3292 + }, + { + "epoch": 2.283633841886269, + "grad_norm": 0.35681057740702554, + "learning_rate": 6.599315530326328e-06, + "loss": 0.4287, + "step": 3293 + }, + { + "epoch": 2.2843273231622745, + "grad_norm": 0.33483056001079603, + "learning_rate": 6.5970218114548e-06, + "loss": 0.4258, + "step": 3294 + }, + { + "epoch": 2.2850208044382803, + "grad_norm": 0.32480905748131034, + "learning_rate": 6.594727718252925e-06, + "loss": 0.3818, + "step": 3295 + }, + { + "epoch": 2.2857142857142856, + "grad_norm": 0.36444629498912573, + "learning_rate": 6.592433251258423e-06, + "loss": 0.4116, + "step": 3296 + }, + { + "epoch": 2.2864077669902914, + "grad_norm": 0.5245725239891985, + "learning_rate": 6.590138411009099e-06, + "loss": 0.4148, + "step": 3297 + }, + { + "epoch": 2.2871012482662967, + "grad_norm": 0.3650205096057206, + "learning_rate": 6.587843198042848e-06, + "loss": 0.461, + "step": 3298 + }, + { + "epoch": 2.2877947295423025, + "grad_norm": 0.3732647441193785, + "learning_rate": 6.585547612897653e-06, + "loss": 0.4461, + "step": 3299 + }, + { + "epoch": 2.2884882108183078, + "grad_norm": 0.37869872424797363, + "learning_rate": 6.583251656111579e-06, + "loss": 0.4488, + "step": 3300 + }, + { + "epoch": 2.2891816920943135, + "grad_norm": 0.32711844570564075, + "learning_rate": 6.580955328222782e-06, + "loss": 0.4044, + "step": 3301 + }, + { + "epoch": 2.289875173370319, + "grad_norm": 0.471035521144447, + "learning_rate": 6.578658629769507e-06, + "loss": 0.4121, + "step": 3302 + }, + { + "epoch": 2.2905686546463246, + "grad_norm": 0.3469179521041166, + "learning_rate": 6.5763615612900834e-06, + "loss": 0.4021, + "step": 3303 + }, + { + "epoch": 2.29126213592233, + "grad_norm": 0.4113219336358887, + "learning_rate": 6.574064123322925e-06, + "loss": 0.4655, + "step": 3304 + }, + { + "epoch": 2.2919556171983357, + "grad_norm": 0.36062914616033437, + "learning_rate": 6.571766316406537e-06, + "loss": 0.4355, + "step": 3305 + }, + { + "epoch": 2.292649098474341, + "grad_norm": 0.34683938613322324, + "learning_rate": 6.569468141079507e-06, + "loss": 0.4104, + "step": 3306 + }, + { + "epoch": 2.293342579750347, + "grad_norm": 0.345232532727373, + "learning_rate": 6.567169597880512e-06, + "loss": 0.3571, + "step": 3307 + }, + { + "epoch": 2.294036061026352, + "grad_norm": 0.38826318949514355, + "learning_rate": 6.564870687348312e-06, + "loss": 0.3943, + "step": 3308 + }, + { + "epoch": 2.294729542302358, + "grad_norm": 0.3717121038562584, + "learning_rate": 6.562571410021758e-06, + "loss": 0.4248, + "step": 3309 + }, + { + "epoch": 2.2954230235783633, + "grad_norm": 0.355063374301673, + "learning_rate": 6.5602717664397795e-06, + "loss": 0.4876, + "step": 3310 + }, + { + "epoch": 2.296116504854369, + "grad_norm": 0.3884043233165812, + "learning_rate": 6.557971757141402e-06, + "loss": 0.4685, + "step": 3311 + }, + { + "epoch": 2.2968099861303743, + "grad_norm": 0.3513554971994437, + "learning_rate": 6.555671382665727e-06, + "loss": 0.4386, + "step": 3312 + }, + { + "epoch": 2.29750346740638, + "grad_norm": 0.3481242369228575, + "learning_rate": 6.5533706435519454e-06, + "loss": 0.4281, + "step": 3313 + }, + { + "epoch": 2.2981969486823854, + "grad_norm": 0.36652653214075054, + "learning_rate": 6.5510695403393365e-06, + "loss": 0.4661, + "step": 3314 + }, + { + "epoch": 2.298890429958391, + "grad_norm": 0.34063580485073686, + "learning_rate": 6.548768073567258e-06, + "loss": 0.4056, + "step": 3315 + }, + { + "epoch": 2.2995839112343965, + "grad_norm": 0.9173466365303112, + "learning_rate": 6.5464662437751634e-06, + "loss": 0.425, + "step": 3316 + }, + { + "epoch": 2.3002773925104023, + "grad_norm": 0.3413504222392783, + "learning_rate": 6.5441640515025795e-06, + "loss": 0.4363, + "step": 3317 + }, + { + "epoch": 2.3009708737864076, + "grad_norm": 0.45599209442604965, + "learning_rate": 6.541861497289126e-06, + "loss": 0.4587, + "step": 3318 + }, + { + "epoch": 2.3016643550624134, + "grad_norm": 0.3560639727612127, + "learning_rate": 6.539558581674503e-06, + "loss": 0.4377, + "step": 3319 + }, + { + "epoch": 2.3023578363384187, + "grad_norm": 0.3211526490582845, + "learning_rate": 6.5372553051985e-06, + "loss": 0.4274, + "step": 3320 + }, + { + "epoch": 2.3030513176144245, + "grad_norm": 0.35060024821190416, + "learning_rate": 6.534951668400986e-06, + "loss": 0.4115, + "step": 3321 + }, + { + "epoch": 2.30374479889043, + "grad_norm": 0.3653406803746035, + "learning_rate": 6.5326476718219165e-06, + "loss": 0.4351, + "step": 3322 + }, + { + "epoch": 2.3044382801664356, + "grad_norm": 0.4708909428237561, + "learning_rate": 6.530343316001334e-06, + "loss": 0.461, + "step": 3323 + }, + { + "epoch": 2.305131761442441, + "grad_norm": 0.37451279720941105, + "learning_rate": 6.52803860147936e-06, + "loss": 0.4568, + "step": 3324 + }, + { + "epoch": 2.3058252427184467, + "grad_norm": 0.3694267698896481, + "learning_rate": 6.525733528796207e-06, + "loss": 0.4257, + "step": 3325 + }, + { + "epoch": 2.306518723994452, + "grad_norm": 0.5161161729147663, + "learning_rate": 6.523428098492163e-06, + "loss": 0.4239, + "step": 3326 + }, + { + "epoch": 2.307212205270458, + "grad_norm": 0.3886816521838468, + "learning_rate": 6.5211223111076065e-06, + "loss": 0.4327, + "step": 3327 + }, + { + "epoch": 2.307905686546463, + "grad_norm": 0.35178441767206575, + "learning_rate": 6.518816167182996e-06, + "loss": 0.4442, + "step": 3328 + }, + { + "epoch": 2.308599167822469, + "grad_norm": 0.38474584607835866, + "learning_rate": 6.516509667258877e-06, + "loss": 0.4708, + "step": 3329 + }, + { + "epoch": 2.309292649098474, + "grad_norm": 0.3695121458647729, + "learning_rate": 6.514202811875874e-06, + "loss": 0.4604, + "step": 3330 + }, + { + "epoch": 2.30998613037448, + "grad_norm": 1.131673848138997, + "learning_rate": 6.511895601574698e-06, + "loss": 0.4022, + "step": 3331 + }, + { + "epoch": 2.3106796116504853, + "grad_norm": 0.3587322375640702, + "learning_rate": 6.509588036896144e-06, + "loss": 0.3987, + "step": 3332 + }, + { + "epoch": 2.311373092926491, + "grad_norm": 0.3353590598403885, + "learning_rate": 6.507280118381085e-06, + "loss": 0.4454, + "step": 3333 + }, + { + "epoch": 2.3120665742024964, + "grad_norm": 0.3714283959907697, + "learning_rate": 6.504971846570484e-06, + "loss": 0.4124, + "step": 3334 + }, + { + "epoch": 2.312760055478502, + "grad_norm": 0.3812094155750211, + "learning_rate": 6.502663222005382e-06, + "loss": 0.4236, + "step": 3335 + }, + { + "epoch": 2.3134535367545075, + "grad_norm": 0.3452165630974439, + "learning_rate": 6.500354245226903e-06, + "loss": 0.431, + "step": 3336 + }, + { + "epoch": 2.3141470180305133, + "grad_norm": 0.3369957844501315, + "learning_rate": 6.498044916776255e-06, + "loss": 0.3756, + "step": 3337 + }, + { + "epoch": 2.3148404993065186, + "grad_norm": 0.37378829273373, + "learning_rate": 6.495735237194727e-06, + "loss": 0.4331, + "step": 3338 + }, + { + "epoch": 2.3155339805825244, + "grad_norm": 0.35482066447873756, + "learning_rate": 6.493425207023693e-06, + "loss": 0.3837, + "step": 3339 + }, + { + "epoch": 2.3162274618585297, + "grad_norm": 0.33713355082890706, + "learning_rate": 6.491114826804607e-06, + "loss": 0.3712, + "step": 3340 + }, + { + "epoch": 2.3169209431345354, + "grad_norm": 0.36309135163251544, + "learning_rate": 6.488804097079005e-06, + "loss": 0.4523, + "step": 3341 + }, + { + "epoch": 2.3176144244105408, + "grad_norm": 0.3634776405367075, + "learning_rate": 6.486493018388502e-06, + "loss": 0.3645, + "step": 3342 + }, + { + "epoch": 2.3183079056865465, + "grad_norm": 0.37242495564367256, + "learning_rate": 6.484181591274804e-06, + "loss": 0.4409, + "step": 3343 + }, + { + "epoch": 2.319001386962552, + "grad_norm": 0.35463085393063387, + "learning_rate": 6.481869816279689e-06, + "loss": 0.3976, + "step": 3344 + }, + { + "epoch": 2.3196948682385576, + "grad_norm": 0.3624813574473872, + "learning_rate": 6.479557693945022e-06, + "loss": 0.4743, + "step": 3345 + }, + { + "epoch": 2.320388349514563, + "grad_norm": 0.348819367646347, + "learning_rate": 6.477245224812746e-06, + "loss": 0.3851, + "step": 3346 + }, + { + "epoch": 2.3210818307905687, + "grad_norm": 0.34648568791108286, + "learning_rate": 6.474932409424888e-06, + "loss": 0.409, + "step": 3347 + }, + { + "epoch": 2.321775312066574, + "grad_norm": 0.3442746098904869, + "learning_rate": 6.4726192483235564e-06, + "loss": 0.4248, + "step": 3348 + }, + { + "epoch": 2.32246879334258, + "grad_norm": 0.3397393802058347, + "learning_rate": 6.470305742050938e-06, + "loss": 0.4438, + "step": 3349 + }, + { + "epoch": 2.323162274618585, + "grad_norm": 0.369794195615468, + "learning_rate": 6.4679918911493015e-06, + "loss": 0.3795, + "step": 3350 + }, + { + "epoch": 2.323855755894591, + "grad_norm": 0.3521241325240761, + "learning_rate": 6.465677696160997e-06, + "loss": 0.404, + "step": 3351 + }, + { + "epoch": 2.3245492371705962, + "grad_norm": 0.37593824832172124, + "learning_rate": 6.463363157628456e-06, + "loss": 0.389, + "step": 3352 + }, + { + "epoch": 2.325242718446602, + "grad_norm": 0.35446973660877806, + "learning_rate": 6.46104827609419e-06, + "loss": 0.4102, + "step": 3353 + }, + { + "epoch": 2.3259361997226073, + "grad_norm": 0.3778037088411012, + "learning_rate": 6.458733052100787e-06, + "loss": 0.3919, + "step": 3354 + }, + { + "epoch": 2.326629680998613, + "grad_norm": 0.36428989535407785, + "learning_rate": 6.456417486190923e-06, + "loss": 0.4491, + "step": 3355 + }, + { + "epoch": 2.3273231622746184, + "grad_norm": 0.36507726540493185, + "learning_rate": 6.454101578907348e-06, + "loss": 0.4083, + "step": 3356 + }, + { + "epoch": 2.328016643550624, + "grad_norm": 0.36794054531079673, + "learning_rate": 6.451785330792894e-06, + "loss": 0.4054, + "step": 3357 + }, + { + "epoch": 2.3287101248266295, + "grad_norm": 0.397215446334308, + "learning_rate": 6.449468742390472e-06, + "loss": 0.4266, + "step": 3358 + }, + { + "epoch": 2.3294036061026353, + "grad_norm": 0.3592690505460678, + "learning_rate": 6.447151814243075e-06, + "loss": 0.445, + "step": 3359 + }, + { + "epoch": 2.3300970873786406, + "grad_norm": 0.434944116526258, + "learning_rate": 6.444834546893773e-06, + "loss": 0.4136, + "step": 3360 + }, + { + "epoch": 2.3307905686546464, + "grad_norm": 0.34509160611793993, + "learning_rate": 6.442516940885718e-06, + "loss": 0.4011, + "step": 3361 + }, + { + "epoch": 2.3314840499306517, + "grad_norm": 0.41579465154441725, + "learning_rate": 6.440198996762139e-06, + "loss": 0.4686, + "step": 3362 + }, + { + "epoch": 2.3321775312066575, + "grad_norm": 0.36455116066509813, + "learning_rate": 6.437880715066346e-06, + "loss": 0.3866, + "step": 3363 + }, + { + "epoch": 2.332871012482663, + "grad_norm": 0.3452314364510656, + "learning_rate": 6.435562096341726e-06, + "loss": 0.4407, + "step": 3364 + }, + { + "epoch": 2.3335644937586686, + "grad_norm": 0.3638852141104239, + "learning_rate": 6.433243141131748e-06, + "loss": 0.3632, + "step": 3365 + }, + { + "epoch": 2.334257975034674, + "grad_norm": 0.33891432758834644, + "learning_rate": 6.430923849979958e-06, + "loss": 0.3857, + "step": 3366 + }, + { + "epoch": 2.3349514563106797, + "grad_norm": 0.3793851285938589, + "learning_rate": 6.42860422342998e-06, + "loss": 0.4592, + "step": 3367 + }, + { + "epoch": 2.335644937586685, + "grad_norm": 0.3513547162756251, + "learning_rate": 6.426284262025519e-06, + "loss": 0.4224, + "step": 3368 + }, + { + "epoch": 2.336338418862691, + "grad_norm": 0.35658306501930737, + "learning_rate": 6.423963966310356e-06, + "loss": 0.4145, + "step": 3369 + }, + { + "epoch": 2.337031900138696, + "grad_norm": 0.3433575800648795, + "learning_rate": 6.4216433368283535e-06, + "loss": 0.4445, + "step": 3370 + }, + { + "epoch": 2.337725381414702, + "grad_norm": 0.3446282815830537, + "learning_rate": 6.419322374123448e-06, + "loss": 0.4129, + "step": 3371 + }, + { + "epoch": 2.338418862690707, + "grad_norm": 0.3896608090763199, + "learning_rate": 6.4170010787396576e-06, + "loss": 0.4529, + "step": 3372 + }, + { + "epoch": 2.339112343966713, + "grad_norm": 0.40974544389265305, + "learning_rate": 6.4146794512210755e-06, + "loss": 0.4485, + "step": 3373 + }, + { + "epoch": 2.3398058252427183, + "grad_norm": 0.37262273430385956, + "learning_rate": 6.412357492111877e-06, + "loss": 0.4875, + "step": 3374 + }, + { + "epoch": 2.340499306518724, + "grad_norm": 0.42558664420672976, + "learning_rate": 6.410035201956311e-06, + "loss": 0.4614, + "step": 3375 + }, + { + "epoch": 2.3411927877947294, + "grad_norm": 0.35238464681827575, + "learning_rate": 6.407712581298705e-06, + "loss": 0.4675, + "step": 3376 + }, + { + "epoch": 2.341886269070735, + "grad_norm": 0.3369379914035939, + "learning_rate": 6.405389630683465e-06, + "loss": 0.4384, + "step": 3377 + }, + { + "epoch": 2.3425797503467405, + "grad_norm": 0.477112803773968, + "learning_rate": 6.403066350655074e-06, + "loss": 0.4147, + "step": 3378 + }, + { + "epoch": 2.3432732316227463, + "grad_norm": 0.36080047173989804, + "learning_rate": 6.400742741758092e-06, + "loss": 0.3987, + "step": 3379 + }, + { + "epoch": 2.3439667128987516, + "grad_norm": 0.3736865943997831, + "learning_rate": 6.3984188045371566e-06, + "loss": 0.4063, + "step": 3380 + }, + { + "epoch": 2.3446601941747574, + "grad_norm": 0.39806065460955636, + "learning_rate": 6.396094539536981e-06, + "loss": 0.4049, + "step": 3381 + }, + { + "epoch": 2.3453536754507627, + "grad_norm": 0.3562460578072664, + "learning_rate": 6.393769947302355e-06, + "loss": 0.4208, + "step": 3382 + }, + { + "epoch": 2.3460471567267684, + "grad_norm": 0.408192722083846, + "learning_rate": 6.391445028378149e-06, + "loss": 0.4524, + "step": 3383 + }, + { + "epoch": 2.3467406380027738, + "grad_norm": 0.32533670575757584, + "learning_rate": 6.389119783309306e-06, + "loss": 0.4205, + "step": 3384 + }, + { + "epoch": 2.3474341192787795, + "grad_norm": 0.37216837987975354, + "learning_rate": 6.386794212640846e-06, + "loss": 0.4087, + "step": 3385 + }, + { + "epoch": 2.348127600554785, + "grad_norm": 0.37150027196366225, + "learning_rate": 6.384468316917865e-06, + "loss": 0.4137, + "step": 3386 + }, + { + "epoch": 2.3488210818307906, + "grad_norm": 0.38447704576737873, + "learning_rate": 6.382142096685538e-06, + "loss": 0.3996, + "step": 3387 + }, + { + "epoch": 2.349514563106796, + "grad_norm": 0.40740775299773047, + "learning_rate": 6.379815552489112e-06, + "loss": 0.4536, + "step": 3388 + }, + { + "epoch": 2.3502080443828017, + "grad_norm": 0.36822169913786423, + "learning_rate": 6.377488684873917e-06, + "loss": 0.4281, + "step": 3389 + }, + { + "epoch": 2.350901525658807, + "grad_norm": 0.40016362939793576, + "learning_rate": 6.375161494385349e-06, + "loss": 0.4337, + "step": 3390 + }, + { + "epoch": 2.351595006934813, + "grad_norm": 0.33381928007705064, + "learning_rate": 6.372833981568885e-06, + "loss": 0.4451, + "step": 3391 + }, + { + "epoch": 2.352288488210818, + "grad_norm": 0.3588959932374742, + "learning_rate": 6.370506146970078e-06, + "loss": 0.4451, + "step": 3392 + }, + { + "epoch": 2.352981969486824, + "grad_norm": 0.37989940100138275, + "learning_rate": 6.368177991134558e-06, + "loss": 0.468, + "step": 3393 + }, + { + "epoch": 2.3536754507628292, + "grad_norm": 0.3410133129775475, + "learning_rate": 6.365849514608025e-06, + "loss": 0.4192, + "step": 3394 + }, + { + "epoch": 2.354368932038835, + "grad_norm": 0.44469936051079273, + "learning_rate": 6.363520717936256e-06, + "loss": 0.4518, + "step": 3395 + }, + { + "epoch": 2.3550624133148403, + "grad_norm": 0.3635685167424104, + "learning_rate": 6.361191601665107e-06, + "loss": 0.4033, + "step": 3396 + }, + { + "epoch": 2.355755894590846, + "grad_norm": 0.36245690943424497, + "learning_rate": 6.358862166340505e-06, + "loss": 0.4223, + "step": 3397 + }, + { + "epoch": 2.3564493758668514, + "grad_norm": 0.37394168257935906, + "learning_rate": 6.356532412508453e-06, + "loss": 0.4353, + "step": 3398 + }, + { + "epoch": 2.357142857142857, + "grad_norm": 0.3710994035820661, + "learning_rate": 6.354202340715027e-06, + "loss": 0.4457, + "step": 3399 + }, + { + "epoch": 2.3578363384188625, + "grad_norm": 0.38630683590497084, + "learning_rate": 6.351871951506379e-06, + "loss": 0.4431, + "step": 3400 + }, + { + "epoch": 2.3585298196948683, + "grad_norm": 0.3568683052878388, + "learning_rate": 6.349541245428737e-06, + "loss": 0.4461, + "step": 3401 + }, + { + "epoch": 2.3592233009708736, + "grad_norm": 0.38771598700185156, + "learning_rate": 6.347210223028403e-06, + "loss": 0.3685, + "step": 3402 + }, + { + "epoch": 2.3599167822468794, + "grad_norm": 0.3544011539020275, + "learning_rate": 6.344878884851748e-06, + "loss": 0.4533, + "step": 3403 + }, + { + "epoch": 2.3606102635228847, + "grad_norm": 0.3670236812365391, + "learning_rate": 6.342547231445222e-06, + "loss": 0.4402, + "step": 3404 + }, + { + "epoch": 2.3613037447988905, + "grad_norm": 0.36476568118874775, + "learning_rate": 6.340215263355348e-06, + "loss": 0.4228, + "step": 3405 + }, + { + "epoch": 2.361997226074896, + "grad_norm": 0.4717730456339519, + "learning_rate": 6.337882981128724e-06, + "loss": 0.3999, + "step": 3406 + }, + { + "epoch": 2.3626907073509016, + "grad_norm": 0.35221953625909663, + "learning_rate": 6.335550385312018e-06, + "loss": 0.3849, + "step": 3407 + }, + { + "epoch": 2.363384188626907, + "grad_norm": 0.3860207819764881, + "learning_rate": 6.3332174764519735e-06, + "loss": 0.4189, + "step": 3408 + }, + { + "epoch": 2.3640776699029127, + "grad_norm": 0.4007386812023215, + "learning_rate": 6.330884255095409e-06, + "loss": 0.4138, + "step": 3409 + }, + { + "epoch": 2.364771151178918, + "grad_norm": 0.37862490931227294, + "learning_rate": 6.328550721789214e-06, + "loss": 0.4621, + "step": 3410 + }, + { + "epoch": 2.3654646324549238, + "grad_norm": 0.35916781086557287, + "learning_rate": 6.326216877080351e-06, + "loss": 0.4342, + "step": 3411 + }, + { + "epoch": 2.366158113730929, + "grad_norm": 0.3577175898636799, + "learning_rate": 6.3238827215158575e-06, + "loss": 0.4258, + "step": 3412 + }, + { + "epoch": 2.366851595006935, + "grad_norm": 0.4217988564723029, + "learning_rate": 6.32154825564284e-06, + "loss": 0.4718, + "step": 3413 + }, + { + "epoch": 2.36754507628294, + "grad_norm": 0.3472591144956317, + "learning_rate": 6.319213480008485e-06, + "loss": 0.3976, + "step": 3414 + }, + { + "epoch": 2.368238557558946, + "grad_norm": 0.39692892084548975, + "learning_rate": 6.3168783951600445e-06, + "loss": 0.4276, + "step": 3415 + }, + { + "epoch": 2.3689320388349513, + "grad_norm": 0.48621750097547883, + "learning_rate": 6.3145430016448435e-06, + "loss": 0.3952, + "step": 3416 + }, + { + "epoch": 2.369625520110957, + "grad_norm": 0.34995196201467055, + "learning_rate": 6.312207300010285e-06, + "loss": 0.4212, + "step": 3417 + }, + { + "epoch": 2.3703190013869624, + "grad_norm": 0.3889657758249156, + "learning_rate": 6.309871290803837e-06, + "loss": 0.455, + "step": 3418 + }, + { + "epoch": 2.371012482662968, + "grad_norm": 0.3354965798611206, + "learning_rate": 6.307534974573048e-06, + "loss": 0.4471, + "step": 3419 + }, + { + "epoch": 2.3717059639389735, + "grad_norm": 0.35780678124045107, + "learning_rate": 6.305198351865527e-06, + "loss": 0.417, + "step": 3420 + }, + { + "epoch": 2.3723994452149793, + "grad_norm": 0.3537815464065854, + "learning_rate": 6.302861423228967e-06, + "loss": 0.4302, + "step": 3421 + }, + { + "epoch": 2.3730929264909846, + "grad_norm": 0.3954711687708549, + "learning_rate": 6.300524189211124e-06, + "loss": 0.4415, + "step": 3422 + }, + { + "epoch": 2.3737864077669903, + "grad_norm": 0.5590188020954749, + "learning_rate": 6.298186650359832e-06, + "loss": 0.4832, + "step": 3423 + }, + { + "epoch": 2.3744798890429957, + "grad_norm": 0.44129155814582244, + "learning_rate": 6.2958488072229895e-06, + "loss": 0.4444, + "step": 3424 + }, + { + "epoch": 2.3751733703190014, + "grad_norm": 0.6650221897421527, + "learning_rate": 6.293510660348572e-06, + "loss": 0.4262, + "step": 3425 + }, + { + "epoch": 2.3758668515950068, + "grad_norm": 0.3840587363773067, + "learning_rate": 6.291172210284624e-06, + "loss": 0.3939, + "step": 3426 + }, + { + "epoch": 2.3765603328710125, + "grad_norm": 0.3935768538349805, + "learning_rate": 6.288833457579261e-06, + "loss": 0.4241, + "step": 3427 + }, + { + "epoch": 2.377253814147018, + "grad_norm": 0.39560772177394354, + "learning_rate": 6.2864944027806684e-06, + "loss": 0.4424, + "step": 3428 + }, + { + "epoch": 2.3779472954230236, + "grad_norm": 0.34737704553925264, + "learning_rate": 6.284155046437107e-06, + "loss": 0.4113, + "step": 3429 + }, + { + "epoch": 2.378640776699029, + "grad_norm": 0.47403656541012373, + "learning_rate": 6.281815389096903e-06, + "loss": 0.4088, + "step": 3430 + }, + { + "epoch": 2.3793342579750347, + "grad_norm": 0.4200280447992286, + "learning_rate": 6.279475431308453e-06, + "loss": 0.4243, + "step": 3431 + }, + { + "epoch": 2.38002773925104, + "grad_norm": 0.43662749967588077, + "learning_rate": 6.2771351736202306e-06, + "loss": 0.4589, + "step": 3432 + }, + { + "epoch": 2.380721220527046, + "grad_norm": 0.39979458496586723, + "learning_rate": 6.27479461658077e-06, + "loss": 0.462, + "step": 3433 + }, + { + "epoch": 2.381414701803051, + "grad_norm": 0.3682603187605637, + "learning_rate": 6.272453760738686e-06, + "loss": 0.3895, + "step": 3434 + }, + { + "epoch": 2.382108183079057, + "grad_norm": 0.36024963440519453, + "learning_rate": 6.270112606642656e-06, + "loss": 0.4127, + "step": 3435 + }, + { + "epoch": 2.3828016643550622, + "grad_norm": 0.3903070950000419, + "learning_rate": 6.267771154841429e-06, + "loss": 0.4087, + "step": 3436 + }, + { + "epoch": 2.383495145631068, + "grad_norm": 0.3391321662642895, + "learning_rate": 6.265429405883825e-06, + "loss": 0.4119, + "step": 3437 + }, + { + "epoch": 2.3841886269070733, + "grad_norm": 0.3686041953326949, + "learning_rate": 6.2630873603187335e-06, + "loss": 0.391, + "step": 3438 + }, + { + "epoch": 2.384882108183079, + "grad_norm": 0.39924365282291413, + "learning_rate": 6.260745018695112e-06, + "loss": 0.4986, + "step": 3439 + }, + { + "epoch": 2.3855755894590844, + "grad_norm": 0.31873868385405696, + "learning_rate": 6.258402381561989e-06, + "loss": 0.3269, + "step": 3440 + }, + { + "epoch": 2.38626907073509, + "grad_norm": 0.3468427356660674, + "learning_rate": 6.256059449468462e-06, + "loss": 0.4511, + "step": 3441 + }, + { + "epoch": 2.3869625520110955, + "grad_norm": 0.3476113659455627, + "learning_rate": 6.253716222963695e-06, + "loss": 0.4157, + "step": 3442 + }, + { + "epoch": 2.3876560332871013, + "grad_norm": 0.39463376375605047, + "learning_rate": 6.251372702596927e-06, + "loss": 0.4244, + "step": 3443 + }, + { + "epoch": 2.3883495145631066, + "grad_norm": 0.3323736685099904, + "learning_rate": 6.24902888891746e-06, + "loss": 0.4304, + "step": 3444 + }, + { + "epoch": 2.3890429958391124, + "grad_norm": 0.385009719696181, + "learning_rate": 6.246684782474665e-06, + "loss": 0.4809, + "step": 3445 + }, + { + "epoch": 2.3897364771151177, + "grad_norm": 0.3946421860197158, + "learning_rate": 6.244340383817989e-06, + "loss": 0.4577, + "step": 3446 + }, + { + "epoch": 2.3904299583911235, + "grad_norm": 0.39202685932477294, + "learning_rate": 6.241995693496939e-06, + "loss": 0.3802, + "step": 3447 + }, + { + "epoch": 2.391123439667129, + "grad_norm": 0.3638305833865608, + "learning_rate": 6.239650712061093e-06, + "loss": 0.4427, + "step": 3448 + }, + { + "epoch": 2.3918169209431346, + "grad_norm": 0.39198565540149627, + "learning_rate": 6.237305440060096e-06, + "loss": 0.4174, + "step": 3449 + }, + { + "epoch": 2.39251040221914, + "grad_norm": 0.37992019651591874, + "learning_rate": 6.234959878043667e-06, + "loss": 0.4051, + "step": 3450 + }, + { + "epoch": 2.3932038834951457, + "grad_norm": 0.38178358197512036, + "learning_rate": 6.232614026561586e-06, + "loss": 0.4343, + "step": 3451 + }, + { + "epoch": 2.393897364771151, + "grad_norm": 0.35388101649029896, + "learning_rate": 6.2302678861637044e-06, + "loss": 0.3979, + "step": 3452 + }, + { + "epoch": 2.3945908460471568, + "grad_norm": 0.3296789306417218, + "learning_rate": 6.2279214573999405e-06, + "loss": 0.4479, + "step": 3453 + }, + { + "epoch": 2.395284327323162, + "grad_norm": 0.35184432654620773, + "learning_rate": 6.225574740820278e-06, + "loss": 0.4292, + "step": 3454 + }, + { + "epoch": 2.395977808599168, + "grad_norm": 0.36142021844379674, + "learning_rate": 6.2232277369747755e-06, + "loss": 0.4408, + "step": 3455 + }, + { + "epoch": 2.396671289875173, + "grad_norm": 0.33277075131702893, + "learning_rate": 6.220880446413548e-06, + "loss": 0.4184, + "step": 3456 + }, + { + "epoch": 2.397364771151179, + "grad_norm": 0.3574119895278261, + "learning_rate": 6.2185328696867866e-06, + "loss": 0.4641, + "step": 3457 + }, + { + "epoch": 2.3980582524271843, + "grad_norm": 0.3700609396267541, + "learning_rate": 6.216185007344745e-06, + "loss": 0.4284, + "step": 3458 + }, + { + "epoch": 2.39875173370319, + "grad_norm": 0.3549329881454592, + "learning_rate": 6.2138368599377465e-06, + "loss": 0.3992, + "step": 3459 + }, + { + "epoch": 2.3994452149791954, + "grad_norm": 0.3568537084222683, + "learning_rate": 6.211488428016179e-06, + "loss": 0.4096, + "step": 3460 + }, + { + "epoch": 2.400138696255201, + "grad_norm": 0.3430531484905292, + "learning_rate": 6.209139712130499e-06, + "loss": 0.4464, + "step": 3461 + }, + { + "epoch": 2.4008321775312065, + "grad_norm": 0.38566858567711926, + "learning_rate": 6.206790712831225e-06, + "loss": 0.4135, + "step": 3462 + }, + { + "epoch": 2.4015256588072122, + "grad_norm": 0.4019088519638128, + "learning_rate": 6.204441430668949e-06, + "loss": 0.4297, + "step": 3463 + }, + { + "epoch": 2.4022191400832176, + "grad_norm": 0.35686918217779046, + "learning_rate": 6.2020918661943265e-06, + "loss": 0.3962, + "step": 3464 + }, + { + "epoch": 2.4029126213592233, + "grad_norm": 0.34747328665351357, + "learning_rate": 6.199742019958074e-06, + "loss": 0.4198, + "step": 3465 + }, + { + "epoch": 2.4036061026352287, + "grad_norm": 0.3719162458419108, + "learning_rate": 6.197391892510982e-06, + "loss": 0.4299, + "step": 3466 + }, + { + "epoch": 2.4042995839112344, + "grad_norm": 0.3310709790808494, + "learning_rate": 6.195041484403902e-06, + "loss": 0.4139, + "step": 3467 + }, + { + "epoch": 2.4049930651872398, + "grad_norm": 0.3798672827503458, + "learning_rate": 6.192690796187753e-06, + "loss": 0.426, + "step": 3468 + }, + { + "epoch": 2.4056865464632455, + "grad_norm": 0.3819894485018657, + "learning_rate": 6.19033982841352e-06, + "loss": 0.3987, + "step": 3469 + }, + { + "epoch": 2.406380027739251, + "grad_norm": 0.39491406632810266, + "learning_rate": 6.1879885816322515e-06, + "loss": 0.4372, + "step": 3470 + }, + { + "epoch": 2.4070735090152566, + "grad_norm": 0.35578187182088616, + "learning_rate": 6.1856370563950615e-06, + "loss": 0.4168, + "step": 3471 + }, + { + "epoch": 2.407766990291262, + "grad_norm": 0.41040564938520374, + "learning_rate": 6.183285253253135e-06, + "loss": 0.4544, + "step": 3472 + }, + { + "epoch": 2.4084604715672677, + "grad_norm": 0.3829588038488335, + "learning_rate": 6.180933172757715e-06, + "loss": 0.4788, + "step": 3473 + }, + { + "epoch": 2.409153952843273, + "grad_norm": 0.36089509034582007, + "learning_rate": 6.17858081546011e-06, + "loss": 0.4531, + "step": 3474 + }, + { + "epoch": 2.409847434119279, + "grad_norm": 0.3335533790964443, + "learning_rate": 6.176228181911699e-06, + "loss": 0.4461, + "step": 3475 + }, + { + "epoch": 2.410540915395284, + "grad_norm": 0.3577332596693892, + "learning_rate": 6.173875272663919e-06, + "loss": 0.4372, + "step": 3476 + }, + { + "epoch": 2.41123439667129, + "grad_norm": 0.35065257131004135, + "learning_rate": 6.171522088268279e-06, + "loss": 0.4337, + "step": 3477 + }, + { + "epoch": 2.4119278779472952, + "grad_norm": 0.3656127154704781, + "learning_rate": 6.169168629276344e-06, + "loss": 0.3947, + "step": 3478 + }, + { + "epoch": 2.412621359223301, + "grad_norm": 0.35555102413957235, + "learning_rate": 6.1668148962397525e-06, + "loss": 0.429, + "step": 3479 + }, + { + "epoch": 2.4133148404993063, + "grad_norm": 0.39728286825182824, + "learning_rate": 6.164460889710196e-06, + "loss": 0.4534, + "step": 3480 + }, + { + "epoch": 2.414008321775312, + "grad_norm": 0.3775536266401251, + "learning_rate": 6.162106610239444e-06, + "loss": 0.4532, + "step": 3481 + }, + { + "epoch": 2.4147018030513174, + "grad_norm": 0.3848447949989429, + "learning_rate": 6.159752058379317e-06, + "loss": 0.4142, + "step": 3482 + }, + { + "epoch": 2.415395284327323, + "grad_norm": 0.3969949138569901, + "learning_rate": 6.157397234681708e-06, + "loss": 0.4592, + "step": 3483 + }, + { + "epoch": 2.4160887656033285, + "grad_norm": 0.3798449635887271, + "learning_rate": 6.155042139698568e-06, + "loss": 0.4945, + "step": 3484 + }, + { + "epoch": 2.4167822468793343, + "grad_norm": 0.4143446886855249, + "learning_rate": 6.152686773981916e-06, + "loss": 0.4362, + "step": 3485 + }, + { + "epoch": 2.4174757281553396, + "grad_norm": 0.38027909160910556, + "learning_rate": 6.150331138083833e-06, + "loss": 0.4299, + "step": 3486 + }, + { + "epoch": 2.4181692094313454, + "grad_norm": 0.3584462250679504, + "learning_rate": 6.147975232556463e-06, + "loss": 0.4575, + "step": 3487 + }, + { + "epoch": 2.4188626907073507, + "grad_norm": 0.36196191936942634, + "learning_rate": 6.145619057952012e-06, + "loss": 0.4479, + "step": 3488 + }, + { + "epoch": 2.4195561719833565, + "grad_norm": 0.3739003319311505, + "learning_rate": 6.14326261482275e-06, + "loss": 0.4457, + "step": 3489 + }, + { + "epoch": 2.420249653259362, + "grad_norm": 0.403643357356033, + "learning_rate": 6.1409059037210095e-06, + "loss": 0.4502, + "step": 3490 + }, + { + "epoch": 2.4209431345353676, + "grad_norm": 0.3882920801237256, + "learning_rate": 6.13854892519919e-06, + "loss": 0.4625, + "step": 3491 + }, + { + "epoch": 2.421636615811373, + "grad_norm": 0.3433730370421034, + "learning_rate": 6.136191679809749e-06, + "loss": 0.4126, + "step": 3492 + }, + { + "epoch": 2.4223300970873787, + "grad_norm": 0.37221985738254343, + "learning_rate": 6.133834168105206e-06, + "loss": 0.4085, + "step": 3493 + }, + { + "epoch": 2.423023578363384, + "grad_norm": 0.370193556439351, + "learning_rate": 6.131476390638145e-06, + "loss": 0.4347, + "step": 3494 + }, + { + "epoch": 2.4237170596393898, + "grad_norm": 0.4214519545995449, + "learning_rate": 6.129118347961214e-06, + "loss": 0.4533, + "step": 3495 + }, + { + "epoch": 2.424410540915395, + "grad_norm": 0.37541750233962984, + "learning_rate": 6.126760040627119e-06, + "loss": 0.4329, + "step": 3496 + }, + { + "epoch": 2.425104022191401, + "grad_norm": 0.3888424675714201, + "learning_rate": 6.124401469188631e-06, + "loss": 0.4654, + "step": 3497 + }, + { + "epoch": 2.425797503467406, + "grad_norm": 0.35927907410069454, + "learning_rate": 6.12204263419858e-06, + "loss": 0.4069, + "step": 3498 + }, + { + "epoch": 2.426490984743412, + "grad_norm": 0.37705068260350294, + "learning_rate": 6.119683536209864e-06, + "loss": 0.4952, + "step": 3499 + }, + { + "epoch": 2.4271844660194173, + "grad_norm": 0.47835491713331174, + "learning_rate": 6.117324175775435e-06, + "loss": 0.4289, + "step": 3500 + }, + { + "epoch": 2.427877947295423, + "grad_norm": 0.3462358718952392, + "learning_rate": 6.114964553448313e-06, + "loss": 0.3877, + "step": 3501 + }, + { + "epoch": 2.4285714285714284, + "grad_norm": 0.4277926935878686, + "learning_rate": 6.112604669781572e-06, + "loss": 0.4543, + "step": 3502 + }, + { + "epoch": 2.429264909847434, + "grad_norm": 0.4495055019010333, + "learning_rate": 6.110244525328356e-06, + "loss": 0.4604, + "step": 3503 + }, + { + "epoch": 2.4299583911234395, + "grad_norm": 0.3625289822831497, + "learning_rate": 6.107884120641863e-06, + "loss": 0.4029, + "step": 3504 + }, + { + "epoch": 2.4306518723994452, + "grad_norm": 0.3693588124354737, + "learning_rate": 6.105523456275358e-06, + "loss": 0.4685, + "step": 3505 + }, + { + "epoch": 2.4313453536754506, + "grad_norm": 0.3685339665885883, + "learning_rate": 6.10316253278216e-06, + "loss": 0.427, + "step": 3506 + }, + { + "epoch": 2.4320388349514563, + "grad_norm": 0.3697019807561721, + "learning_rate": 6.100801350715652e-06, + "loss": 0.3954, + "step": 3507 + }, + { + "epoch": 2.4327323162274617, + "grad_norm": 0.363318103111065, + "learning_rate": 6.098439910629282e-06, + "loss": 0.4114, + "step": 3508 + }, + { + "epoch": 2.4334257975034674, + "grad_norm": 0.3266315487180377, + "learning_rate": 6.096078213076553e-06, + "loss": 0.419, + "step": 3509 + }, + { + "epoch": 2.4341192787794728, + "grad_norm": 0.4050230145362845, + "learning_rate": 6.093716258611028e-06, + "loss": 0.4872, + "step": 3510 + }, + { + "epoch": 2.4348127600554785, + "grad_norm": 0.38873262828825567, + "learning_rate": 6.091354047786333e-06, + "loss": 0.3918, + "step": 3511 + }, + { + "epoch": 2.435506241331484, + "grad_norm": 0.36588040289423995, + "learning_rate": 6.088991581156152e-06, + "loss": 0.41, + "step": 3512 + }, + { + "epoch": 2.4361997226074896, + "grad_norm": 0.3774239461687723, + "learning_rate": 6.086628859274233e-06, + "loss": 0.4458, + "step": 3513 + }, + { + "epoch": 2.436893203883495, + "grad_norm": 0.35692562807169503, + "learning_rate": 6.084265882694378e-06, + "loss": 0.3932, + "step": 3514 + }, + { + "epoch": 2.4375866851595007, + "grad_norm": 0.37598880536831636, + "learning_rate": 6.081902651970453e-06, + "loss": 0.429, + "step": 3515 + }, + { + "epoch": 2.438280166435506, + "grad_norm": 0.37778554976321177, + "learning_rate": 6.079539167656382e-06, + "loss": 0.4405, + "step": 3516 + }, + { + "epoch": 2.438973647711512, + "grad_norm": 0.41604532500207625, + "learning_rate": 6.077175430306148e-06, + "loss": 0.4417, + "step": 3517 + }, + { + "epoch": 2.4396671289875176, + "grad_norm": 0.37668432889024267, + "learning_rate": 6.074811440473795e-06, + "loss": 0.4255, + "step": 3518 + }, + { + "epoch": 2.440360610263523, + "grad_norm": 0.34923433939653253, + "learning_rate": 6.0724471987134245e-06, + "loss": 0.3966, + "step": 3519 + }, + { + "epoch": 2.4410540915395282, + "grad_norm": 0.37713864245797907, + "learning_rate": 6.070082705579198e-06, + "loss": 0.4256, + "step": 3520 + }, + { + "epoch": 2.441747572815534, + "grad_norm": 0.47284460768485814, + "learning_rate": 6.0677179616253345e-06, + "loss": 0.4094, + "step": 3521 + }, + { + "epoch": 2.4424410540915398, + "grad_norm": 0.3865985984685952, + "learning_rate": 6.065352967406114e-06, + "loss": 0.4521, + "step": 3522 + }, + { + "epoch": 2.443134535367545, + "grad_norm": 0.3400910164485317, + "learning_rate": 6.062987723475873e-06, + "loss": 0.4126, + "step": 3523 + }, + { + "epoch": 2.4438280166435504, + "grad_norm": 0.36605467331989594, + "learning_rate": 6.060622230389008e-06, + "loss": 0.4153, + "step": 3524 + }, + { + "epoch": 2.444521497919556, + "grad_norm": 0.34121558679763603, + "learning_rate": 6.058256488699974e-06, + "loss": 0.4017, + "step": 3525 + }, + { + "epoch": 2.445214979195562, + "grad_norm": 0.39246465973234645, + "learning_rate": 6.055890498963284e-06, + "loss": 0.4629, + "step": 3526 + }, + { + "epoch": 2.4459084604715673, + "grad_norm": 0.3637550735351632, + "learning_rate": 6.053524261733508e-06, + "loss": 0.4641, + "step": 3527 + }, + { + "epoch": 2.4466019417475726, + "grad_norm": 0.3841111199229264, + "learning_rate": 6.0511577775652744e-06, + "loss": 0.4158, + "step": 3528 + }, + { + "epoch": 2.4472954230235784, + "grad_norm": 0.36406369231437513, + "learning_rate": 6.048791047013272e-06, + "loss": 0.4249, + "step": 3529 + }, + { + "epoch": 2.447988904299584, + "grad_norm": 0.37439954664911407, + "learning_rate": 6.046424070632241e-06, + "loss": 0.4294, + "step": 3530 + }, + { + "epoch": 2.4486823855755895, + "grad_norm": 0.33899110882039546, + "learning_rate": 6.044056848976988e-06, + "loss": 0.384, + "step": 3531 + }, + { + "epoch": 2.449375866851595, + "grad_norm": 0.4084797866489152, + "learning_rate": 6.041689382602372e-06, + "loss": 0.4447, + "step": 3532 + }, + { + "epoch": 2.4500693481276006, + "grad_norm": 0.4280099837633069, + "learning_rate": 6.039321672063308e-06, + "loss": 0.4501, + "step": 3533 + }, + { + "epoch": 2.4507628294036063, + "grad_norm": 0.3928584550836353, + "learning_rate": 6.036953717914771e-06, + "loss": 0.4167, + "step": 3534 + }, + { + "epoch": 2.4514563106796117, + "grad_norm": 0.34285367268244193, + "learning_rate": 6.034585520711792e-06, + "loss": 0.3974, + "step": 3535 + }, + { + "epoch": 2.452149791955617, + "grad_norm": 0.3484333096468712, + "learning_rate": 6.0322170810094606e-06, + "loss": 0.4106, + "step": 3536 + }, + { + "epoch": 2.4528432732316228, + "grad_norm": 0.36921100923026773, + "learning_rate": 6.029848399362921e-06, + "loss": 0.3723, + "step": 3537 + }, + { + "epoch": 2.4535367545076285, + "grad_norm": 0.3725268524285565, + "learning_rate": 6.027479476327376e-06, + "loss": 0.3896, + "step": 3538 + }, + { + "epoch": 2.454230235783634, + "grad_norm": 0.3655224259408961, + "learning_rate": 6.02511031245808e-06, + "loss": 0.3995, + "step": 3539 + }, + { + "epoch": 2.454923717059639, + "grad_norm": 0.3598975146850662, + "learning_rate": 6.022740908310354e-06, + "loss": 0.4762, + "step": 3540 + }, + { + "epoch": 2.455617198335645, + "grad_norm": 0.38659460067365026, + "learning_rate": 6.020371264439566e-06, + "loss": 0.4543, + "step": 3541 + }, + { + "epoch": 2.4563106796116507, + "grad_norm": 0.37124608830701855, + "learning_rate": 6.018001381401143e-06, + "loss": 0.4279, + "step": 3542 + }, + { + "epoch": 2.457004160887656, + "grad_norm": 0.4545050738953536, + "learning_rate": 6.015631259750568e-06, + "loss": 0.4661, + "step": 3543 + }, + { + "epoch": 2.4576976421636614, + "grad_norm": 0.4153803989283625, + "learning_rate": 6.013260900043381e-06, + "loss": 0.4452, + "step": 3544 + }, + { + "epoch": 2.458391123439667, + "grad_norm": 0.4653826703965434, + "learning_rate": 6.01089030283518e-06, + "loss": 0.4573, + "step": 3545 + }, + { + "epoch": 2.459084604715673, + "grad_norm": 0.3557503352742577, + "learning_rate": 6.008519468681612e-06, + "loss": 0.4483, + "step": 3546 + }, + { + "epoch": 2.4597780859916782, + "grad_norm": 0.3781927730213355, + "learning_rate": 6.006148398138383e-06, + "loss": 0.4116, + "step": 3547 + }, + { + "epoch": 2.4604715672676836, + "grad_norm": 0.34923264673005194, + "learning_rate": 6.003777091761257e-06, + "loss": 0.4043, + "step": 3548 + }, + { + "epoch": 2.4611650485436893, + "grad_norm": 0.36707061518528994, + "learning_rate": 6.001405550106052e-06, + "loss": 0.4249, + "step": 3549 + }, + { + "epoch": 2.461858529819695, + "grad_norm": 0.3506262194345391, + "learning_rate": 5.999033773728637e-06, + "loss": 0.4103, + "step": 3550 + }, + { + "epoch": 2.4625520110957004, + "grad_norm": 0.35118606152034737, + "learning_rate": 5.996661763184941e-06, + "loss": 0.4691, + "step": 3551 + }, + { + "epoch": 2.4632454923717058, + "grad_norm": 0.3696119369979769, + "learning_rate": 5.994289519030946e-06, + "loss": 0.4536, + "step": 3552 + }, + { + "epoch": 2.4639389736477115, + "grad_norm": 0.3368421069519799, + "learning_rate": 5.991917041822689e-06, + "loss": 0.3999, + "step": 3553 + }, + { + "epoch": 2.4646324549237173, + "grad_norm": 0.38561838631313217, + "learning_rate": 5.9895443321162615e-06, + "loss": 0.4684, + "step": 3554 + }, + { + "epoch": 2.4653259361997226, + "grad_norm": 0.3903469011689506, + "learning_rate": 5.987171390467808e-06, + "loss": 0.4104, + "step": 3555 + }, + { + "epoch": 2.466019417475728, + "grad_norm": 0.36896222945766727, + "learning_rate": 5.9847982174335314e-06, + "loss": 0.4161, + "step": 3556 + }, + { + "epoch": 2.4667128987517337, + "grad_norm": 0.38581153329654083, + "learning_rate": 5.982424813569684e-06, + "loss": 0.4483, + "step": 3557 + }, + { + "epoch": 2.4674063800277395, + "grad_norm": 0.3822456506248116, + "learning_rate": 5.980051179432575e-06, + "loss": 0.4406, + "step": 3558 + }, + { + "epoch": 2.468099861303745, + "grad_norm": 0.35559358171512856, + "learning_rate": 5.97767731557857e-06, + "loss": 0.4235, + "step": 3559 + }, + { + "epoch": 2.46879334257975, + "grad_norm": 0.34699049854280395, + "learning_rate": 5.975303222564079e-06, + "loss": 0.3956, + "step": 3560 + }, + { + "epoch": 2.469486823855756, + "grad_norm": 0.3590407573688313, + "learning_rate": 5.972928900945578e-06, + "loss": 0.4391, + "step": 3561 + }, + { + "epoch": 2.4701803051317617, + "grad_norm": 0.3690075310976927, + "learning_rate": 5.97055435127959e-06, + "loss": 0.4586, + "step": 3562 + }, + { + "epoch": 2.470873786407767, + "grad_norm": 0.37842562549053155, + "learning_rate": 5.96817957412269e-06, + "loss": 0.4242, + "step": 3563 + }, + { + "epoch": 2.4715672676837723, + "grad_norm": 0.32643519567856305, + "learning_rate": 5.965804570031508e-06, + "loss": 0.4156, + "step": 3564 + }, + { + "epoch": 2.472260748959778, + "grad_norm": 0.36356557827169317, + "learning_rate": 5.963429339562731e-06, + "loss": 0.4422, + "step": 3565 + }, + { + "epoch": 2.472954230235784, + "grad_norm": 0.32838023738047295, + "learning_rate": 5.961053883273095e-06, + "loss": 0.4091, + "step": 3566 + }, + { + "epoch": 2.473647711511789, + "grad_norm": 0.4418368573176722, + "learning_rate": 5.958678201719389e-06, + "loss": 0.3887, + "step": 3567 + }, + { + "epoch": 2.4743411927877945, + "grad_norm": 0.3456252120828457, + "learning_rate": 5.9563022954584545e-06, + "loss": 0.4132, + "step": 3568 + }, + { + "epoch": 2.4750346740638003, + "grad_norm": 0.39403659649425854, + "learning_rate": 5.953926165047189e-06, + "loss": 0.4126, + "step": 3569 + }, + { + "epoch": 2.475728155339806, + "grad_norm": 0.3497396008806446, + "learning_rate": 5.951549811042539e-06, + "loss": 0.4301, + "step": 3570 + }, + { + "epoch": 2.4764216366158114, + "grad_norm": 0.36668609408187486, + "learning_rate": 5.949173234001504e-06, + "loss": 0.4193, + "step": 3571 + }, + { + "epoch": 2.4771151178918167, + "grad_norm": 0.38766503952118897, + "learning_rate": 5.946796434481137e-06, + "loss": 0.4382, + "step": 3572 + }, + { + "epoch": 2.4778085991678225, + "grad_norm": 0.3745107517147705, + "learning_rate": 5.944419413038544e-06, + "loss": 0.3907, + "step": 3573 + }, + { + "epoch": 2.4785020804438282, + "grad_norm": 0.35816419270557776, + "learning_rate": 5.942042170230879e-06, + "loss": 0.415, + "step": 3574 + }, + { + "epoch": 2.4791955617198336, + "grad_norm": 0.36245015715227935, + "learning_rate": 5.939664706615352e-06, + "loss": 0.4235, + "step": 3575 + }, + { + "epoch": 2.479889042995839, + "grad_norm": 0.42959334504062235, + "learning_rate": 5.937287022749223e-06, + "loss": 0.4475, + "step": 3576 + }, + { + "epoch": 2.4805825242718447, + "grad_norm": 0.3883778674941454, + "learning_rate": 5.934909119189806e-06, + "loss": 0.3753, + "step": 3577 + }, + { + "epoch": 2.4812760055478504, + "grad_norm": 0.34801657442294465, + "learning_rate": 5.932530996494461e-06, + "loss": 0.407, + "step": 3578 + }, + { + "epoch": 2.4819694868238558, + "grad_norm": 0.3699175010091076, + "learning_rate": 5.930152655220603e-06, + "loss": 0.389, + "step": 3579 + }, + { + "epoch": 2.482662968099861, + "grad_norm": 0.43406862954324715, + "learning_rate": 5.9277740959257e-06, + "loss": 0.4265, + "step": 3580 + }, + { + "epoch": 2.483356449375867, + "grad_norm": 0.3753728386909298, + "learning_rate": 5.925395319167268e-06, + "loss": 0.4493, + "step": 3581 + }, + { + "epoch": 2.4840499306518726, + "grad_norm": 0.35952262852375005, + "learning_rate": 5.923016325502877e-06, + "loss": 0.3805, + "step": 3582 + }, + { + "epoch": 2.484743411927878, + "grad_norm": 0.37871090159875964, + "learning_rate": 5.920637115490142e-06, + "loss": 0.4933, + "step": 3583 + }, + { + "epoch": 2.4854368932038833, + "grad_norm": 0.34136792331318594, + "learning_rate": 5.918257689686736e-06, + "loss": 0.374, + "step": 3584 + }, + { + "epoch": 2.486130374479889, + "grad_norm": 0.3621451250815319, + "learning_rate": 5.915878048650376e-06, + "loss": 0.4056, + "step": 3585 + }, + { + "epoch": 2.486823855755895, + "grad_norm": 0.37449972640169954, + "learning_rate": 5.9134981929388365e-06, + "loss": 0.4089, + "step": 3586 + }, + { + "epoch": 2.4875173370319, + "grad_norm": 0.3590037832570276, + "learning_rate": 5.911118123109937e-06, + "loss": 0.4493, + "step": 3587 + }, + { + "epoch": 2.4882108183079055, + "grad_norm": 0.3463147956300701, + "learning_rate": 5.9087378397215454e-06, + "loss": 0.4313, + "step": 3588 + }, + { + "epoch": 2.4889042995839112, + "grad_norm": 0.3676590254111761, + "learning_rate": 5.906357343331587e-06, + "loss": 0.3864, + "step": 3589 + }, + { + "epoch": 2.489597780859917, + "grad_norm": 0.33792631019747793, + "learning_rate": 5.903976634498032e-06, + "loss": 0.3962, + "step": 3590 + }, + { + "epoch": 2.4902912621359223, + "grad_norm": 0.3908004843816045, + "learning_rate": 5.9015957137789006e-06, + "loss": 0.4741, + "step": 3591 + }, + { + "epoch": 2.4909847434119277, + "grad_norm": 0.39982125493541454, + "learning_rate": 5.899214581732262e-06, + "loss": 0.4227, + "step": 3592 + }, + { + "epoch": 2.4916782246879334, + "grad_norm": 0.3714182417065424, + "learning_rate": 5.8968332389162395e-06, + "loss": 0.4359, + "step": 3593 + }, + { + "epoch": 2.492371705963939, + "grad_norm": 0.3659110301562501, + "learning_rate": 5.894451685889001e-06, + "loss": 0.478, + "step": 3594 + }, + { + "epoch": 2.4930651872399445, + "grad_norm": 0.3578060360196927, + "learning_rate": 5.892069923208765e-06, + "loss": 0.4271, + "step": 3595 + }, + { + "epoch": 2.49375866851595, + "grad_norm": 0.46400941665591694, + "learning_rate": 5.889687951433799e-06, + "loss": 0.4859, + "step": 3596 + }, + { + "epoch": 2.4944521497919556, + "grad_norm": 0.34094717603291347, + "learning_rate": 5.88730577112242e-06, + "loss": 0.4468, + "step": 3597 + }, + { + "epoch": 2.4951456310679614, + "grad_norm": 0.33934210232918094, + "learning_rate": 5.8849233828329964e-06, + "loss": 0.4303, + "step": 3598 + }, + { + "epoch": 2.4958391123439667, + "grad_norm": 0.3756564374773508, + "learning_rate": 5.88254078712394e-06, + "loss": 0.4781, + "step": 3599 + }, + { + "epoch": 2.496532593619972, + "grad_norm": 0.32980797659104166, + "learning_rate": 5.880157984553714e-06, + "loss": 0.3996, + "step": 3600 + }, + { + "epoch": 2.497226074895978, + "grad_norm": 0.3400914728972975, + "learning_rate": 5.877774975680831e-06, + "loss": 0.3972, + "step": 3601 + }, + { + "epoch": 2.4979195561719836, + "grad_norm": 0.3357533296775801, + "learning_rate": 5.875391761063851e-06, + "loss": 0.4651, + "step": 3602 + }, + { + "epoch": 2.498613037447989, + "grad_norm": 0.32063844730744445, + "learning_rate": 5.873008341261383e-06, + "loss": 0.4244, + "step": 3603 + }, + { + "epoch": 2.4993065187239942, + "grad_norm": 0.375169256208774, + "learning_rate": 5.870624716832083e-06, + "loss": 0.4301, + "step": 3604 + }, + { + "epoch": 2.5, + "grad_norm": 0.3475080822390931, + "learning_rate": 5.8682408883346535e-06, + "loss": 0.4085, + "step": 3605 + }, + { + "epoch": 2.5006934812760058, + "grad_norm": 0.36772826908546163, + "learning_rate": 5.865856856327846e-06, + "loss": 0.4092, + "step": 3606 + }, + { + "epoch": 2.501386962552011, + "grad_norm": 0.3627256418545802, + "learning_rate": 5.8634726213704655e-06, + "loss": 0.4287, + "step": 3607 + }, + { + "epoch": 2.5020804438280164, + "grad_norm": 0.4019754729879606, + "learning_rate": 5.861088184021355e-06, + "loss": 0.4462, + "step": 3608 + }, + { + "epoch": 2.502773925104022, + "grad_norm": 0.4945247373056535, + "learning_rate": 5.858703544839409e-06, + "loss": 0.4067, + "step": 3609 + }, + { + "epoch": 2.503467406380028, + "grad_norm": 0.3735384090853162, + "learning_rate": 5.856318704383572e-06, + "loss": 0.4371, + "step": 3610 + }, + { + "epoch": 2.5041608876560333, + "grad_norm": 0.3758964298783903, + "learning_rate": 5.853933663212833e-06, + "loss": 0.384, + "step": 3611 + }, + { + "epoch": 2.5048543689320386, + "grad_norm": 0.3642334855044776, + "learning_rate": 5.8515484218862286e-06, + "loss": 0.3896, + "step": 3612 + }, + { + "epoch": 2.5055478502080444, + "grad_norm": 0.38987795882478504, + "learning_rate": 5.849162980962839e-06, + "loss": 0.4152, + "step": 3613 + }, + { + "epoch": 2.50624133148405, + "grad_norm": 0.35814210764448684, + "learning_rate": 5.8467773410017995e-06, + "loss": 0.3991, + "step": 3614 + }, + { + "epoch": 2.5069348127600555, + "grad_norm": 0.4023951436808565, + "learning_rate": 5.844391502562281e-06, + "loss": 0.4031, + "step": 3615 + }, + { + "epoch": 2.507628294036061, + "grad_norm": 0.6038430228135157, + "learning_rate": 5.842005466203511e-06, + "loss": 0.4175, + "step": 3616 + }, + { + "epoch": 2.5083217753120666, + "grad_norm": 0.36399073534256104, + "learning_rate": 5.839619232484758e-06, + "loss": 0.4646, + "step": 3617 + }, + { + "epoch": 2.5090152565880723, + "grad_norm": 0.37808237247968673, + "learning_rate": 5.837232801965338e-06, + "loss": 0.45, + "step": 3618 + }, + { + "epoch": 2.5097087378640777, + "grad_norm": 0.4126439871763773, + "learning_rate": 5.834846175204612e-06, + "loss": 0.4647, + "step": 3619 + }, + { + "epoch": 2.510402219140083, + "grad_norm": 0.3525077089200721, + "learning_rate": 5.832459352761989e-06, + "loss": 0.3847, + "step": 3620 + }, + { + "epoch": 2.5110957004160888, + "grad_norm": 0.3593513739651244, + "learning_rate": 5.830072335196921e-06, + "loss": 0.4512, + "step": 3621 + }, + { + "epoch": 2.5117891816920945, + "grad_norm": 1.5823414805917269, + "learning_rate": 5.827685123068912e-06, + "loss": 0.3924, + "step": 3622 + }, + { + "epoch": 2.5124826629681, + "grad_norm": 0.389686569362526, + "learning_rate": 5.825297716937503e-06, + "loss": 0.4216, + "step": 3623 + }, + { + "epoch": 2.513176144244105, + "grad_norm": 0.35500195083809455, + "learning_rate": 5.822910117362287e-06, + "loss": 0.4264, + "step": 3624 + }, + { + "epoch": 2.513869625520111, + "grad_norm": 0.3666874572909834, + "learning_rate": 5.820522324902899e-06, + "loss": 0.4945, + "step": 3625 + }, + { + "epoch": 2.5145631067961167, + "grad_norm": 0.36266210708471175, + "learning_rate": 5.818134340119021e-06, + "loss": 0.4217, + "step": 3626 + }, + { + "epoch": 2.515256588072122, + "grad_norm": 0.37129892582074814, + "learning_rate": 5.815746163570378e-06, + "loss": 0.4046, + "step": 3627 + }, + { + "epoch": 2.5159500693481274, + "grad_norm": 0.3676494083903266, + "learning_rate": 5.813357795816742e-06, + "loss": 0.3646, + "step": 3628 + }, + { + "epoch": 2.516643550624133, + "grad_norm": 0.40734484978334934, + "learning_rate": 5.81096923741793e-06, + "loss": 0.4124, + "step": 3629 + }, + { + "epoch": 2.517337031900139, + "grad_norm": 0.3497720871251518, + "learning_rate": 5.8085804889338014e-06, + "loss": 0.4193, + "step": 3630 + }, + { + "epoch": 2.5180305131761442, + "grad_norm": 0.3842406086139197, + "learning_rate": 5.806191550924264e-06, + "loss": 0.4444, + "step": 3631 + }, + { + "epoch": 2.5187239944521496, + "grad_norm": 0.4316227415806176, + "learning_rate": 5.803802423949265e-06, + "loss": 0.4265, + "step": 3632 + }, + { + "epoch": 2.5194174757281553, + "grad_norm": 0.3453880931807856, + "learning_rate": 5.801413108568798e-06, + "loss": 0.4264, + "step": 3633 + }, + { + "epoch": 2.520110957004161, + "grad_norm": 0.3736426631488774, + "learning_rate": 5.7990236053429025e-06, + "loss": 0.4394, + "step": 3634 + }, + { + "epoch": 2.5208044382801664, + "grad_norm": 0.4198448009764268, + "learning_rate": 5.7966339148316615e-06, + "loss": 0.4521, + "step": 3635 + }, + { + "epoch": 2.5214979195561718, + "grad_norm": 0.3961006874056316, + "learning_rate": 5.7942440375952015e-06, + "loss": 0.468, + "step": 3636 + }, + { + "epoch": 2.5221914008321775, + "grad_norm": 0.36862208361195653, + "learning_rate": 5.791853974193688e-06, + "loss": 0.3924, + "step": 3637 + }, + { + "epoch": 2.5228848821081833, + "grad_norm": 0.3970154434075573, + "learning_rate": 5.789463725187341e-06, + "loss": 0.4583, + "step": 3638 + }, + { + "epoch": 2.5235783633841886, + "grad_norm": 0.4032495629966687, + "learning_rate": 5.787073291136414e-06, + "loss": 0.5179, + "step": 3639 + }, + { + "epoch": 2.524271844660194, + "grad_norm": 0.465356869874365, + "learning_rate": 5.7846826726012076e-06, + "loss": 0.428, + "step": 3640 + }, + { + "epoch": 2.5249653259361997, + "grad_norm": 0.3575642988970886, + "learning_rate": 5.7822918701420636e-06, + "loss": 0.4404, + "step": 3641 + }, + { + "epoch": 2.5256588072122055, + "grad_norm": 0.41001725877069756, + "learning_rate": 5.779900884319372e-06, + "loss": 0.3935, + "step": 3642 + }, + { + "epoch": 2.526352288488211, + "grad_norm": 0.3740055435921111, + "learning_rate": 5.777509715693562e-06, + "loss": 0.4215, + "step": 3643 + }, + { + "epoch": 2.527045769764216, + "grad_norm": 0.3964942778362928, + "learning_rate": 5.775118364825107e-06, + "loss": 0.4949, + "step": 3644 + }, + { + "epoch": 2.527739251040222, + "grad_norm": 0.4343354336726223, + "learning_rate": 5.772726832274519e-06, + "loss": 0.4173, + "step": 3645 + }, + { + "epoch": 2.5284327323162277, + "grad_norm": 0.3953240330347288, + "learning_rate": 5.7703351186023575e-06, + "loss": 0.4559, + "step": 3646 + }, + { + "epoch": 2.529126213592233, + "grad_norm": 0.3437620302551727, + "learning_rate": 5.767943224369224e-06, + "loss": 0.4283, + "step": 3647 + }, + { + "epoch": 2.5298196948682383, + "grad_norm": 0.3600388539742395, + "learning_rate": 5.765551150135761e-06, + "loss": 0.4244, + "step": 3648 + }, + { + "epoch": 2.530513176144244, + "grad_norm": 0.378088768154225, + "learning_rate": 5.763158896462653e-06, + "loss": 0.4117, + "step": 3649 + }, + { + "epoch": 2.53120665742025, + "grad_norm": 0.3871220823092367, + "learning_rate": 5.760766463910624e-06, + "loss": 0.4257, + "step": 3650 + }, + { + "epoch": 2.531900138696255, + "grad_norm": 0.3975216481332579, + "learning_rate": 5.758373853040447e-06, + "loss": 0.4102, + "step": 3651 + }, + { + "epoch": 2.5325936199722605, + "grad_norm": 0.3545417809303507, + "learning_rate": 5.755981064412933e-06, + "loss": 0.3845, + "step": 3652 + }, + { + "epoch": 2.5332871012482663, + "grad_norm": 0.3963274566480634, + "learning_rate": 5.753588098588931e-06, + "loss": 0.4906, + "step": 3653 + }, + { + "epoch": 2.533980582524272, + "grad_norm": 0.3966180419143601, + "learning_rate": 5.751194956129337e-06, + "loss": 0.3977, + "step": 3654 + }, + { + "epoch": 2.5346740638002774, + "grad_norm": 0.33594603471503137, + "learning_rate": 5.748801637595085e-06, + "loss": 0.4099, + "step": 3655 + }, + { + "epoch": 2.5353675450762827, + "grad_norm": 0.3748737960899183, + "learning_rate": 5.746408143547153e-06, + "loss": 0.4, + "step": 3656 + }, + { + "epoch": 2.5360610263522885, + "grad_norm": 0.3624917302380124, + "learning_rate": 5.7440144745465575e-06, + "loss": 0.4139, + "step": 3657 + }, + { + "epoch": 2.5367545076282942, + "grad_norm": 0.35178626204784935, + "learning_rate": 5.7416206311543576e-06, + "loss": 0.3942, + "step": 3658 + }, + { + "epoch": 2.5374479889042996, + "grad_norm": 0.34888369461008567, + "learning_rate": 5.739226613931652e-06, + "loss": 0.4444, + "step": 3659 + }, + { + "epoch": 2.538141470180305, + "grad_norm": 0.37680720361164716, + "learning_rate": 5.736832423439583e-06, + "loss": 0.4489, + "step": 3660 + }, + { + "epoch": 2.5388349514563107, + "grad_norm": 0.3374700867181296, + "learning_rate": 5.734438060239331e-06, + "loss": 0.3839, + "step": 3661 + }, + { + "epoch": 2.5395284327323164, + "grad_norm": 0.37486206247935777, + "learning_rate": 5.732043524892115e-06, + "loss": 0.4187, + "step": 3662 + }, + { + "epoch": 2.5402219140083218, + "grad_norm": 0.3528700373676645, + "learning_rate": 5.7296488179592e-06, + "loss": 0.3598, + "step": 3663 + }, + { + "epoch": 2.540915395284327, + "grad_norm": 0.3405786207476967, + "learning_rate": 5.727253940001884e-06, + "loss": 0.4334, + "step": 3664 + }, + { + "epoch": 2.541608876560333, + "grad_norm": 0.37800274693889274, + "learning_rate": 5.724858891581515e-06, + "loss": 0.4946, + "step": 3665 + }, + { + "epoch": 2.5423023578363386, + "grad_norm": 0.6256768377791588, + "learning_rate": 5.722463673259469e-06, + "loss": 0.4233, + "step": 3666 + }, + { + "epoch": 2.542995839112344, + "grad_norm": 0.3709353796892423, + "learning_rate": 5.7200682855971715e-06, + "loss": 0.4505, + "step": 3667 + }, + { + "epoch": 2.5436893203883493, + "grad_norm": 0.3869510608020292, + "learning_rate": 5.717672729156082e-06, + "loss": 0.4752, + "step": 3668 + }, + { + "epoch": 2.544382801664355, + "grad_norm": 0.37218033040148146, + "learning_rate": 5.715277004497702e-06, + "loss": 0.4544, + "step": 3669 + }, + { + "epoch": 2.545076282940361, + "grad_norm": 0.3533052851495621, + "learning_rate": 5.712881112183575e-06, + "loss": 0.4383, + "step": 3670 + }, + { + "epoch": 2.545769764216366, + "grad_norm": 0.36007293429082987, + "learning_rate": 5.710485052775275e-06, + "loss": 0.42, + "step": 3671 + }, + { + "epoch": 2.5464632454923715, + "grad_norm": 0.34545982114615587, + "learning_rate": 5.708088826834426e-06, + "loss": 0.407, + "step": 3672 + }, + { + "epoch": 2.5471567267683772, + "grad_norm": 0.401933665753541, + "learning_rate": 5.705692434922684e-06, + "loss": 0.5003, + "step": 3673 + }, + { + "epoch": 2.547850208044383, + "grad_norm": 0.3793201709540235, + "learning_rate": 5.703295877601745e-06, + "loss": 0.4904, + "step": 3674 + }, + { + "epoch": 2.5485436893203883, + "grad_norm": 0.3471811469217267, + "learning_rate": 5.700899155433347e-06, + "loss": 0.4075, + "step": 3675 + }, + { + "epoch": 2.5492371705963937, + "grad_norm": 0.3632752848689134, + "learning_rate": 5.698502268979263e-06, + "loss": 0.4053, + "step": 3676 + }, + { + "epoch": 2.5499306518723994, + "grad_norm": 0.3611471748014215, + "learning_rate": 5.6961052188013055e-06, + "loss": 0.4524, + "step": 3677 + }, + { + "epoch": 2.550624133148405, + "grad_norm": 0.34716061061819636, + "learning_rate": 5.693708005461327e-06, + "loss": 0.4568, + "step": 3678 + }, + { + "epoch": 2.5513176144244105, + "grad_norm": 0.3651992123544262, + "learning_rate": 5.691310629521215e-06, + "loss": 0.4323, + "step": 3679 + }, + { + "epoch": 2.552011095700416, + "grad_norm": 0.3700645595991681, + "learning_rate": 5.688913091542899e-06, + "loss": 0.4512, + "step": 3680 + }, + { + "epoch": 2.5527045769764216, + "grad_norm": 0.40132072059824997, + "learning_rate": 5.686515392088344e-06, + "loss": 0.4173, + "step": 3681 + }, + { + "epoch": 2.5533980582524274, + "grad_norm": 0.416106150662046, + "learning_rate": 5.684117531719552e-06, + "loss": 0.4643, + "step": 3682 + }, + { + "epoch": 2.5540915395284327, + "grad_norm": 0.33317813353684955, + "learning_rate": 5.681719510998565e-06, + "loss": 0.3805, + "step": 3683 + }, + { + "epoch": 2.554785020804438, + "grad_norm": 0.35243769236231487, + "learning_rate": 5.6793213304874624e-06, + "loss": 0.368, + "step": 3684 + }, + { + "epoch": 2.555478502080444, + "grad_norm": 0.3344743557379927, + "learning_rate": 5.67692299074836e-06, + "loss": 0.3887, + "step": 3685 + }, + { + "epoch": 2.5561719833564496, + "grad_norm": 0.3365237283325583, + "learning_rate": 5.674524492343411e-06, + "loss": 0.4553, + "step": 3686 + }, + { + "epoch": 2.556865464632455, + "grad_norm": 0.3610517869545695, + "learning_rate": 5.672125835834805e-06, + "loss": 0.4153, + "step": 3687 + }, + { + "epoch": 2.5575589459084602, + "grad_norm": 0.39418427644436477, + "learning_rate": 5.669727021784772e-06, + "loss": 0.4177, + "step": 3688 + }, + { + "epoch": 2.558252427184466, + "grad_norm": 0.38356523634243245, + "learning_rate": 5.667328050755576e-06, + "loss": 0.4408, + "step": 3689 + }, + { + "epoch": 2.5589459084604718, + "grad_norm": 0.37485358936035373, + "learning_rate": 5.664928923309518e-06, + "loss": 0.4429, + "step": 3690 + }, + { + "epoch": 2.559639389736477, + "grad_norm": 0.36326370801839963, + "learning_rate": 5.662529640008933e-06, + "loss": 0.4442, + "step": 3691 + }, + { + "epoch": 2.5603328710124824, + "grad_norm": 0.3588050611935666, + "learning_rate": 5.660130201416203e-06, + "loss": 0.4018, + "step": 3692 + }, + { + "epoch": 2.561026352288488, + "grad_norm": 0.3808139714513288, + "learning_rate": 5.657730608093732e-06, + "loss": 0.4372, + "step": 3693 + }, + { + "epoch": 2.561719833564494, + "grad_norm": 0.4158124410644996, + "learning_rate": 5.655330860603971e-06, + "loss": 0.4114, + "step": 3694 + }, + { + "epoch": 2.5624133148404993, + "grad_norm": 0.3726410234351283, + "learning_rate": 5.652930959509402e-06, + "loss": 0.4471, + "step": 3695 + }, + { + "epoch": 2.5631067961165046, + "grad_norm": 0.4089846857911151, + "learning_rate": 5.650530905372545e-06, + "loss": 0.4298, + "step": 3696 + }, + { + "epoch": 2.5638002773925104, + "grad_norm": 0.38888347350128055, + "learning_rate": 5.648130698755954e-06, + "loss": 0.4317, + "step": 3697 + }, + { + "epoch": 2.564493758668516, + "grad_norm": 0.4227994034076653, + "learning_rate": 5.645730340222224e-06, + "loss": 0.4903, + "step": 3698 + }, + { + "epoch": 2.5651872399445215, + "grad_norm": 0.3711593143868576, + "learning_rate": 5.6433298303339764e-06, + "loss": 0.4298, + "step": 3699 + }, + { + "epoch": 2.565880721220527, + "grad_norm": 0.37733931107737134, + "learning_rate": 5.640929169653876e-06, + "loss": 0.4166, + "step": 3700 + }, + { + "epoch": 2.5665742024965326, + "grad_norm": 0.35997706656355766, + "learning_rate": 5.638528358744621e-06, + "loss": 0.3856, + "step": 3701 + }, + { + "epoch": 2.5672676837725383, + "grad_norm": 0.37300285393631594, + "learning_rate": 5.636127398168942e-06, + "loss": 0.4392, + "step": 3702 + }, + { + "epoch": 2.5679611650485437, + "grad_norm": 0.36269776826786154, + "learning_rate": 5.633726288489609e-06, + "loss": 0.4053, + "step": 3703 + }, + { + "epoch": 2.568654646324549, + "grad_norm": 0.43047705361189964, + "learning_rate": 5.631325030269422e-06, + "loss": 0.4553, + "step": 3704 + }, + { + "epoch": 2.5693481276005548, + "grad_norm": 0.38034380170016574, + "learning_rate": 5.628923624071222e-06, + "loss": 0.4124, + "step": 3705 + }, + { + "epoch": 2.5700416088765605, + "grad_norm": 0.3866072263745971, + "learning_rate": 5.626522070457879e-06, + "loss": 0.3747, + "step": 3706 + }, + { + "epoch": 2.570735090152566, + "grad_norm": 0.3873848514008934, + "learning_rate": 5.6241203699923e-06, + "loss": 0.4257, + "step": 3707 + }, + { + "epoch": 2.571428571428571, + "grad_norm": 0.41228222972864187, + "learning_rate": 5.621718523237427e-06, + "loss": 0.4273, + "step": 3708 + }, + { + "epoch": 2.572122052704577, + "grad_norm": 0.3971077971750073, + "learning_rate": 5.619316530756234e-06, + "loss": 0.3981, + "step": 3709 + }, + { + "epoch": 2.5728155339805827, + "grad_norm": 0.381797077329742, + "learning_rate": 5.616914393111732e-06, + "loss": 0.4329, + "step": 3710 + }, + { + "epoch": 2.573509015256588, + "grad_norm": 0.3660546560833639, + "learning_rate": 5.614512110866963e-06, + "loss": 0.3804, + "step": 3711 + }, + { + "epoch": 2.5742024965325934, + "grad_norm": 0.39255317148981794, + "learning_rate": 5.612109684585007e-06, + "loss": 0.4257, + "step": 3712 + }, + { + "epoch": 2.574895977808599, + "grad_norm": 0.3773999077983148, + "learning_rate": 5.6097071148289725e-06, + "loss": 0.4213, + "step": 3713 + }, + { + "epoch": 2.575589459084605, + "grad_norm": 0.3768246769612683, + "learning_rate": 5.607304402162008e-06, + "loss": 0.4416, + "step": 3714 + }, + { + "epoch": 2.5762829403606102, + "grad_norm": 0.36375337821157094, + "learning_rate": 5.604901547147289e-06, + "loss": 0.41, + "step": 3715 + }, + { + "epoch": 2.5769764216366156, + "grad_norm": 0.360967608892819, + "learning_rate": 5.602498550348028e-06, + "loss": 0.4571, + "step": 3716 + }, + { + "epoch": 2.5776699029126213, + "grad_norm": 0.3411486721155386, + "learning_rate": 5.600095412327471e-06, + "loss": 0.3802, + "step": 3717 + }, + { + "epoch": 2.578363384188627, + "grad_norm": 0.39022463140534003, + "learning_rate": 5.597692133648894e-06, + "loss": 0.4289, + "step": 3718 + }, + { + "epoch": 2.5790568654646324, + "grad_norm": 0.37037153721869426, + "learning_rate": 5.595288714875612e-06, + "loss": 0.3758, + "step": 3719 + }, + { + "epoch": 2.5797503467406377, + "grad_norm": 0.35030185660699426, + "learning_rate": 5.592885156570964e-06, + "loss": 0.458, + "step": 3720 + }, + { + "epoch": 2.5804438280166435, + "grad_norm": 0.35810883640075986, + "learning_rate": 5.590481459298332e-06, + "loss": 0.454, + "step": 3721 + }, + { + "epoch": 2.5811373092926493, + "grad_norm": 0.3449845728933635, + "learning_rate": 5.588077623621119e-06, + "loss": 0.4534, + "step": 3722 + }, + { + "epoch": 2.5818307905686546, + "grad_norm": 0.33558412575658775, + "learning_rate": 5.585673650102772e-06, + "loss": 0.406, + "step": 3723 + }, + { + "epoch": 2.58252427184466, + "grad_norm": 0.35776352115379173, + "learning_rate": 5.583269539306762e-06, + "loss": 0.4254, + "step": 3724 + }, + { + "epoch": 2.5832177531206657, + "grad_norm": 0.3671552044271029, + "learning_rate": 5.580865291796598e-06, + "loss": 0.4048, + "step": 3725 + }, + { + "epoch": 2.5839112343966715, + "grad_norm": 0.3441193519045531, + "learning_rate": 5.578460908135815e-06, + "loss": 0.4394, + "step": 3726 + }, + { + "epoch": 2.584604715672677, + "grad_norm": 0.42984264415314943, + "learning_rate": 5.576056388887985e-06, + "loss": 0.4469, + "step": 3727 + }, + { + "epoch": 2.585298196948682, + "grad_norm": 0.3770052828728155, + "learning_rate": 5.57365173461671e-06, + "loss": 0.4379, + "step": 3728 + }, + { + "epoch": 2.585991678224688, + "grad_norm": 0.35548290003551913, + "learning_rate": 5.5712469458856226e-06, + "loss": 0.4313, + "step": 3729 + }, + { + "epoch": 2.5866851595006937, + "grad_norm": 0.38674569287024546, + "learning_rate": 5.568842023258389e-06, + "loss": 0.439, + "step": 3730 + }, + { + "epoch": 2.587378640776699, + "grad_norm": 0.40608562893752964, + "learning_rate": 5.5664369672987025e-06, + "loss": 0.4199, + "step": 3731 + }, + { + "epoch": 2.5880721220527043, + "grad_norm": 0.3967768646832322, + "learning_rate": 5.564031778570293e-06, + "loss": 0.4563, + "step": 3732 + }, + { + "epoch": 2.58876560332871, + "grad_norm": 0.6433857519437939, + "learning_rate": 5.561626457636923e-06, + "loss": 0.419, + "step": 3733 + }, + { + "epoch": 2.589459084604716, + "grad_norm": 0.3345315937284391, + "learning_rate": 5.559221005062377e-06, + "loss": 0.4125, + "step": 3734 + }, + { + "epoch": 2.590152565880721, + "grad_norm": 0.36153949866576096, + "learning_rate": 5.556815421410479e-06, + "loss": 0.3792, + "step": 3735 + }, + { + "epoch": 2.5908460471567265, + "grad_norm": 0.3656600591312674, + "learning_rate": 5.554409707245076e-06, + "loss": 0.3912, + "step": 3736 + }, + { + "epoch": 2.5915395284327323, + "grad_norm": 0.34484330304303534, + "learning_rate": 5.552003863130053e-06, + "loss": 0.4442, + "step": 3737 + }, + { + "epoch": 2.592233009708738, + "grad_norm": 0.3951634673635669, + "learning_rate": 5.549597889629325e-06, + "loss": 0.4141, + "step": 3738 + }, + { + "epoch": 2.5929264909847434, + "grad_norm": 0.34791603379098446, + "learning_rate": 5.54719178730683e-06, + "loss": 0.4432, + "step": 3739 + }, + { + "epoch": 2.5936199722607487, + "grad_norm": 0.4019635224982042, + "learning_rate": 5.544785556726544e-06, + "loss": 0.4255, + "step": 3740 + }, + { + "epoch": 2.5943134535367545, + "grad_norm": 0.3527970462135893, + "learning_rate": 5.542379198452468e-06, + "loss": 0.4089, + "step": 3741 + }, + { + "epoch": 2.5950069348127602, + "grad_norm": 0.37691743428061764, + "learning_rate": 5.5399727130486365e-06, + "loss": 0.432, + "step": 3742 + }, + { + "epoch": 2.5957004160887656, + "grad_norm": 0.3596247988540151, + "learning_rate": 5.537566101079113e-06, + "loss": 0.4501, + "step": 3743 + }, + { + "epoch": 2.596393897364771, + "grad_norm": 0.3729150713327292, + "learning_rate": 5.535159363107986e-06, + "loss": 0.4464, + "step": 3744 + }, + { + "epoch": 2.5970873786407767, + "grad_norm": 0.3860713401631495, + "learning_rate": 5.532752499699381e-06, + "loss": 0.4373, + "step": 3745 + }, + { + "epoch": 2.5977808599167824, + "grad_norm": 0.3716020769185054, + "learning_rate": 5.53034551141745e-06, + "loss": 0.3894, + "step": 3746 + }, + { + "epoch": 2.5984743411927878, + "grad_norm": 0.45183884790407536, + "learning_rate": 5.527938398826371e-06, + "loss": 0.4719, + "step": 3747 + }, + { + "epoch": 2.599167822468793, + "grad_norm": 0.33219206342278024, + "learning_rate": 5.525531162490354e-06, + "loss": 0.3978, + "step": 3748 + }, + { + "epoch": 2.599861303744799, + "grad_norm": 0.3504710087195076, + "learning_rate": 5.523123802973639e-06, + "loss": 0.4453, + "step": 3749 + }, + { + "epoch": 2.6005547850208046, + "grad_norm": 0.37587629823776186, + "learning_rate": 5.520716320840495e-06, + "loss": 0.3941, + "step": 3750 + }, + { + "epoch": 2.60124826629681, + "grad_norm": 0.6691127730153275, + "learning_rate": 5.518308716655216e-06, + "loss": 0.3909, + "step": 3751 + }, + { + "epoch": 2.6019417475728153, + "grad_norm": 0.366401235771044, + "learning_rate": 5.515900990982125e-06, + "loss": 0.4781, + "step": 3752 + }, + { + "epoch": 2.602635228848821, + "grad_norm": 0.3688956077443146, + "learning_rate": 5.51349314438558e-06, + "loss": 0.4083, + "step": 3753 + }, + { + "epoch": 2.603328710124827, + "grad_norm": 0.35856846365208345, + "learning_rate": 5.511085177429961e-06, + "loss": 0.4293, + "step": 3754 + }, + { + "epoch": 2.604022191400832, + "grad_norm": 0.3751803405536985, + "learning_rate": 5.508677090679678e-06, + "loss": 0.4318, + "step": 3755 + }, + { + "epoch": 2.6047156726768375, + "grad_norm": 0.36795464504698244, + "learning_rate": 5.5062688846991684e-06, + "loss": 0.4388, + "step": 3756 + }, + { + "epoch": 2.6054091539528432, + "grad_norm": 0.3297011393946642, + "learning_rate": 5.503860560052898e-06, + "loss": 0.4121, + "step": 3757 + }, + { + "epoch": 2.606102635228849, + "grad_norm": 0.3498062100571866, + "learning_rate": 5.501452117305363e-06, + "loss": 0.4351, + "step": 3758 + }, + { + "epoch": 2.6067961165048543, + "grad_norm": 0.3759610636124447, + "learning_rate": 5.499043557021083e-06, + "loss": 0.4001, + "step": 3759 + }, + { + "epoch": 2.6074895977808596, + "grad_norm": 0.361913735062871, + "learning_rate": 5.496634879764607e-06, + "loss": 0.4162, + "step": 3760 + }, + { + "epoch": 2.6081830790568654, + "grad_norm": 0.60258597968192, + "learning_rate": 5.494226086100513e-06, + "loss": 0.4537, + "step": 3761 + }, + { + "epoch": 2.608876560332871, + "grad_norm": 0.37847753758606156, + "learning_rate": 5.491817176593402e-06, + "loss": 0.4678, + "step": 3762 + }, + { + "epoch": 2.6095700416088765, + "grad_norm": 0.3625587244406342, + "learning_rate": 5.489408151807908e-06, + "loss": 0.4089, + "step": 3763 + }, + { + "epoch": 2.610263522884882, + "grad_norm": 0.4700096526748301, + "learning_rate": 5.486999012308688e-06, + "loss": 0.4449, + "step": 3764 + }, + { + "epoch": 2.6109570041608876, + "grad_norm": 0.40812257849765043, + "learning_rate": 5.484589758660426e-06, + "loss": 0.4594, + "step": 3765 + }, + { + "epoch": 2.6116504854368934, + "grad_norm": 0.34209366412052367, + "learning_rate": 5.482180391427834e-06, + "loss": 0.3939, + "step": 3766 + }, + { + "epoch": 2.6123439667128987, + "grad_norm": 0.3800270302095452, + "learning_rate": 5.479770911175649e-06, + "loss": 0.4419, + "step": 3767 + }, + { + "epoch": 2.613037447988904, + "grad_norm": 0.35056882827820857, + "learning_rate": 5.4773613184686395e-06, + "loss": 0.4048, + "step": 3768 + }, + { + "epoch": 2.61373092926491, + "grad_norm": 0.34360538237402993, + "learning_rate": 5.474951613871593e-06, + "loss": 0.3936, + "step": 3769 + }, + { + "epoch": 2.6144244105409156, + "grad_norm": 0.3568445867350128, + "learning_rate": 5.472541797949329e-06, + "loss": 0.4234, + "step": 3770 + }, + { + "epoch": 2.615117891816921, + "grad_norm": 0.3870140610785522, + "learning_rate": 5.470131871266687e-06, + "loss": 0.4686, + "step": 3771 + }, + { + "epoch": 2.615811373092926, + "grad_norm": 0.3850573844208301, + "learning_rate": 5.467721834388543e-06, + "loss": 0.4577, + "step": 3772 + }, + { + "epoch": 2.616504854368932, + "grad_norm": 0.361784609782144, + "learning_rate": 5.465311687879785e-06, + "loss": 0.44, + "step": 3773 + }, + { + "epoch": 2.6171983356449378, + "grad_norm": 0.3682537385488214, + "learning_rate": 5.46290143230534e-06, + "loss": 0.4247, + "step": 3774 + }, + { + "epoch": 2.617891816920943, + "grad_norm": 0.37963767194274, + "learning_rate": 5.460491068230151e-06, + "loss": 0.4396, + "step": 3775 + }, + { + "epoch": 2.6185852981969484, + "grad_norm": 0.3635303131208463, + "learning_rate": 5.45808059621919e-06, + "loss": 0.3716, + "step": 3776 + }, + { + "epoch": 2.619278779472954, + "grad_norm": 0.3518733468155541, + "learning_rate": 5.4556700168374545e-06, + "loss": 0.4338, + "step": 3777 + }, + { + "epoch": 2.61997226074896, + "grad_norm": 0.3618489564081225, + "learning_rate": 5.453259330649968e-06, + "loss": 0.4228, + "step": 3778 + }, + { + "epoch": 2.6206657420249653, + "grad_norm": 0.3787554840616143, + "learning_rate": 5.450848538221778e-06, + "loss": 0.4473, + "step": 3779 + }, + { + "epoch": 2.6213592233009706, + "grad_norm": 0.4644117103205756, + "learning_rate": 5.448437640117954e-06, + "loss": 0.4532, + "step": 3780 + }, + { + "epoch": 2.6220527045769764, + "grad_norm": 0.3886094686504835, + "learning_rate": 5.446026636903597e-06, + "loss": 0.4161, + "step": 3781 + }, + { + "epoch": 2.622746185852982, + "grad_norm": 0.3534053504637415, + "learning_rate": 5.443615529143824e-06, + "loss": 0.4568, + "step": 3782 + }, + { + "epoch": 2.6234396671289875, + "grad_norm": 0.42493276447793155, + "learning_rate": 5.441204317403786e-06, + "loss": 0.4519, + "step": 3783 + }, + { + "epoch": 2.624133148404993, + "grad_norm": 0.36455551842590017, + "learning_rate": 5.43879300224865e-06, + "loss": 0.4692, + "step": 3784 + }, + { + "epoch": 2.6248266296809986, + "grad_norm": 0.36011237410953445, + "learning_rate": 5.436381584243612e-06, + "loss": 0.4225, + "step": 3785 + }, + { + "epoch": 2.6255201109570043, + "grad_norm": 0.3334683283562822, + "learning_rate": 5.4339700639538916e-06, + "loss": 0.4399, + "step": 3786 + }, + { + "epoch": 2.6262135922330097, + "grad_norm": 0.4146399380578723, + "learning_rate": 5.431558441944731e-06, + "loss": 0.447, + "step": 3787 + }, + { + "epoch": 2.6269070735090154, + "grad_norm": 0.4636544598493252, + "learning_rate": 5.429146718781399e-06, + "loss": 0.4003, + "step": 3788 + }, + { + "epoch": 2.6276005547850207, + "grad_norm": 0.37412138567330283, + "learning_rate": 5.426734895029181e-06, + "loss": 0.4417, + "step": 3789 + }, + { + "epoch": 2.6282940360610265, + "grad_norm": 0.3722010929181449, + "learning_rate": 5.424322971253395e-06, + "loss": 0.4565, + "step": 3790 + }, + { + "epoch": 2.628987517337032, + "grad_norm": 0.4112008967241748, + "learning_rate": 5.4219109480193785e-06, + "loss": 0.4685, + "step": 3791 + }, + { + "epoch": 2.6296809986130376, + "grad_norm": 0.37991973696376485, + "learning_rate": 5.419498825892492e-06, + "loss": 0.4443, + "step": 3792 + }, + { + "epoch": 2.630374479889043, + "grad_norm": 0.3991715151825618, + "learning_rate": 5.417086605438117e-06, + "loss": 0.4896, + "step": 3793 + }, + { + "epoch": 2.6310679611650487, + "grad_norm": 0.3647426870348548, + "learning_rate": 5.414674287221663e-06, + "loss": 0.4505, + "step": 3794 + }, + { + "epoch": 2.631761442441054, + "grad_norm": 0.3845405016986538, + "learning_rate": 5.412261871808559e-06, + "loss": 0.4155, + "step": 3795 + }, + { + "epoch": 2.63245492371706, + "grad_norm": 0.3631268322169773, + "learning_rate": 5.4098493597642595e-06, + "loss": 0.4128, + "step": 3796 + }, + { + "epoch": 2.633148404993065, + "grad_norm": 0.7856996704831801, + "learning_rate": 5.407436751654238e-06, + "loss": 0.4495, + "step": 3797 + }, + { + "epoch": 2.633841886269071, + "grad_norm": 0.4233322253719357, + "learning_rate": 5.4050240480439906e-06, + "loss": 0.4176, + "step": 3798 + }, + { + "epoch": 2.6345353675450762, + "grad_norm": 0.38586780141610955, + "learning_rate": 5.402611249499042e-06, + "loss": 0.4427, + "step": 3799 + }, + { + "epoch": 2.635228848821082, + "grad_norm": 0.3778446149227224, + "learning_rate": 5.400198356584932e-06, + "loss": 0.4388, + "step": 3800 + }, + { + "epoch": 2.6359223300970873, + "grad_norm": 0.36889278078010745, + "learning_rate": 5.397785369867227e-06, + "loss": 0.4256, + "step": 3801 + }, + { + "epoch": 2.636615811373093, + "grad_norm": 0.3465106412138196, + "learning_rate": 5.395372289911509e-06, + "loss": 0.4158, + "step": 3802 + }, + { + "epoch": 2.6373092926490984, + "grad_norm": 0.376075046648369, + "learning_rate": 5.392959117283391e-06, + "loss": 0.4359, + "step": 3803 + }, + { + "epoch": 2.638002773925104, + "grad_norm": 0.3903036427794065, + "learning_rate": 5.390545852548502e-06, + "loss": 0.4346, + "step": 3804 + }, + { + "epoch": 2.6386962552011095, + "grad_norm": 0.3293396104525081, + "learning_rate": 5.388132496272493e-06, + "loss": 0.3703, + "step": 3805 + }, + { + "epoch": 2.6393897364771153, + "grad_norm": 0.42174816417885386, + "learning_rate": 5.3857190490210385e-06, + "loss": 0.375, + "step": 3806 + }, + { + "epoch": 2.6400832177531206, + "grad_norm": 0.4618785646157206, + "learning_rate": 5.383305511359832e-06, + "loss": 0.4177, + "step": 3807 + }, + { + "epoch": 2.6407766990291264, + "grad_norm": 0.3694977352082934, + "learning_rate": 5.380891883854591e-06, + "loss": 0.4109, + "step": 3808 + }, + { + "epoch": 2.6414701803051317, + "grad_norm": 0.3592035568808262, + "learning_rate": 5.3784781670710495e-06, + "loss": 0.4085, + "step": 3809 + }, + { + "epoch": 2.6421636615811375, + "grad_norm": 0.43947745945284844, + "learning_rate": 5.3760643615749675e-06, + "loss": 0.4321, + "step": 3810 + }, + { + "epoch": 2.642857142857143, + "grad_norm": 0.32092024034246674, + "learning_rate": 5.373650467932122e-06, + "loss": 0.3822, + "step": 3811 + }, + { + "epoch": 2.6435506241331486, + "grad_norm": 0.35861530565497973, + "learning_rate": 5.3712364867083134e-06, + "loss": 0.394, + "step": 3812 + }, + { + "epoch": 2.644244105409154, + "grad_norm": 0.378026435324307, + "learning_rate": 5.368822418469361e-06, + "loss": 0.4177, + "step": 3813 + }, + { + "epoch": 2.6449375866851597, + "grad_norm": 0.3584373070708219, + "learning_rate": 5.366408263781104e-06, + "loss": 0.375, + "step": 3814 + }, + { + "epoch": 2.645631067961165, + "grad_norm": 0.41986316255367523, + "learning_rate": 5.363994023209404e-06, + "loss": 0.4616, + "step": 3815 + }, + { + "epoch": 2.6463245492371708, + "grad_norm": 0.3451241161415589, + "learning_rate": 5.361579697320142e-06, + "loss": 0.4281, + "step": 3816 + }, + { + "epoch": 2.647018030513176, + "grad_norm": 1.4369867788114814, + "learning_rate": 5.359165286679218e-06, + "loss": 0.3929, + "step": 3817 + }, + { + "epoch": 2.647711511789182, + "grad_norm": 0.3895089837798235, + "learning_rate": 5.35675079185255e-06, + "loss": 0.4415, + "step": 3818 + }, + { + "epoch": 2.648404993065187, + "grad_norm": 0.3626656054242166, + "learning_rate": 5.354336213406082e-06, + "loss": 0.4074, + "step": 3819 + }, + { + "epoch": 2.649098474341193, + "grad_norm": 0.37542529805220165, + "learning_rate": 5.351921551905771e-06, + "loss": 0.4433, + "step": 3820 + }, + { + "epoch": 2.6497919556171983, + "grad_norm": 0.36070413357304115, + "learning_rate": 5.349506807917596e-06, + "loss": 0.4098, + "step": 3821 + }, + { + "epoch": 2.650485436893204, + "grad_norm": 0.3708055315327396, + "learning_rate": 5.347091982007557e-06, + "loss": 0.4061, + "step": 3822 + }, + { + "epoch": 2.6511789181692094, + "grad_norm": 0.35116147704308043, + "learning_rate": 5.344677074741672e-06, + "loss": 0.4528, + "step": 3823 + }, + { + "epoch": 2.651872399445215, + "grad_norm": 0.36822519637537554, + "learning_rate": 5.342262086685978e-06, + "loss": 0.4577, + "step": 3824 + }, + { + "epoch": 2.6525658807212205, + "grad_norm": 0.33138172961978474, + "learning_rate": 5.339847018406528e-06, + "loss": 0.4056, + "step": 3825 + }, + { + "epoch": 2.6532593619972262, + "grad_norm": 0.3952774135535713, + "learning_rate": 5.337431870469398e-06, + "loss": 0.4416, + "step": 3826 + }, + { + "epoch": 2.6539528432732316, + "grad_norm": 0.37808939390975305, + "learning_rate": 5.335016643440682e-06, + "loss": 0.4733, + "step": 3827 + }, + { + "epoch": 2.6546463245492373, + "grad_norm": 0.3883742534975216, + "learning_rate": 5.332601337886491e-06, + "loss": 0.4753, + "step": 3828 + }, + { + "epoch": 2.6553398058252426, + "grad_norm": 0.3658034489543328, + "learning_rate": 5.330185954372955e-06, + "loss": 0.4091, + "step": 3829 + }, + { + "epoch": 2.6560332871012484, + "grad_norm": 0.352242607905625, + "learning_rate": 5.327770493466222e-06, + "loss": 0.4351, + "step": 3830 + }, + { + "epoch": 2.6567267683772537, + "grad_norm": 0.38755288958224077, + "learning_rate": 5.325354955732459e-06, + "loss": 0.4242, + "step": 3831 + }, + { + "epoch": 2.6574202496532595, + "grad_norm": 0.4128883676447061, + "learning_rate": 5.322939341737853e-06, + "loss": 0.4599, + "step": 3832 + }, + { + "epoch": 2.658113730929265, + "grad_norm": 0.3446035011512247, + "learning_rate": 5.320523652048603e-06, + "loss": 0.3813, + "step": 3833 + }, + { + "epoch": 2.6588072122052706, + "grad_norm": 0.44146869732628513, + "learning_rate": 5.318107887230929e-06, + "loss": 0.4327, + "step": 3834 + }, + { + "epoch": 2.659500693481276, + "grad_norm": 0.382938696230218, + "learning_rate": 5.31569204785107e-06, + "loss": 0.4394, + "step": 3835 + }, + { + "epoch": 2.6601941747572817, + "grad_norm": 0.30243261300851093, + "learning_rate": 5.3132761344752825e-06, + "loss": 0.3379, + "step": 3836 + }, + { + "epoch": 2.660887656033287, + "grad_norm": 0.35189179104860097, + "learning_rate": 5.3108601476698385e-06, + "loss": 0.4075, + "step": 3837 + }, + { + "epoch": 2.661581137309293, + "grad_norm": 0.35557402150217593, + "learning_rate": 5.308444088001027e-06, + "loss": 0.4013, + "step": 3838 + }, + { + "epoch": 2.662274618585298, + "grad_norm": 0.3751145373263566, + "learning_rate": 5.3060279560351534e-06, + "loss": 0.3767, + "step": 3839 + }, + { + "epoch": 2.662968099861304, + "grad_norm": 0.40124517323670383, + "learning_rate": 5.303611752338545e-06, + "loss": 0.4441, + "step": 3840 + }, + { + "epoch": 2.663661581137309, + "grad_norm": 0.37029711674283333, + "learning_rate": 5.301195477477541e-06, + "loss": 0.4523, + "step": 3841 + }, + { + "epoch": 2.664355062413315, + "grad_norm": 0.3457799414696006, + "learning_rate": 5.298779132018498e-06, + "loss": 0.3814, + "step": 3842 + }, + { + "epoch": 2.6650485436893203, + "grad_norm": 0.4467982075906699, + "learning_rate": 5.2963627165277884e-06, + "loss": 0.5013, + "step": 3843 + }, + { + "epoch": 2.665742024965326, + "grad_norm": 0.3612903049099139, + "learning_rate": 5.293946231571806e-06, + "loss": 0.4286, + "step": 3844 + }, + { + "epoch": 2.6664355062413314, + "grad_norm": 0.42096380664232824, + "learning_rate": 5.291529677716957e-06, + "loss": 0.4524, + "step": 3845 + }, + { + "epoch": 2.667128987517337, + "grad_norm": 0.37842426275360985, + "learning_rate": 5.289113055529662e-06, + "loss": 0.4953, + "step": 3846 + }, + { + "epoch": 2.6678224687933425, + "grad_norm": 0.4527863340602083, + "learning_rate": 5.2866963655763585e-06, + "loss": 0.4964, + "step": 3847 + }, + { + "epoch": 2.6685159500693483, + "grad_norm": 0.41038815945225465, + "learning_rate": 5.2842796084235056e-06, + "loss": 0.4955, + "step": 3848 + }, + { + "epoch": 2.6692094313453536, + "grad_norm": 0.6243938111017512, + "learning_rate": 5.281862784637572e-06, + "loss": 0.4459, + "step": 3849 + }, + { + "epoch": 2.6699029126213594, + "grad_norm": 0.48529753597167835, + "learning_rate": 5.279445894785042e-06, + "loss": 0.4218, + "step": 3850 + }, + { + "epoch": 2.6705963938973647, + "grad_norm": 0.39357500392050937, + "learning_rate": 5.277028939432417e-06, + "loss": 0.4677, + "step": 3851 + }, + { + "epoch": 2.6712898751733705, + "grad_norm": 0.4158515313337612, + "learning_rate": 5.274611919146216e-06, + "loss": 0.4871, + "step": 3852 + }, + { + "epoch": 2.671983356449376, + "grad_norm": 0.3648318014578259, + "learning_rate": 5.27219483449297e-06, + "loss": 0.4444, + "step": 3853 + }, + { + "epoch": 2.6726768377253816, + "grad_norm": 0.39245061080790583, + "learning_rate": 5.269777686039226e-06, + "loss": 0.4346, + "step": 3854 + }, + { + "epoch": 2.673370319001387, + "grad_norm": 0.37761298304059504, + "learning_rate": 5.267360474351546e-06, + "loss": 0.51, + "step": 3855 + }, + { + "epoch": 2.6740638002773927, + "grad_norm": 0.3276778860246595, + "learning_rate": 5.264943199996506e-06, + "loss": 0.3985, + "step": 3856 + }, + { + "epoch": 2.674757281553398, + "grad_norm": 0.400475805899134, + "learning_rate": 5.2625258635407004e-06, + "loss": 0.4936, + "step": 3857 + }, + { + "epoch": 2.6754507628294038, + "grad_norm": 0.38231088685546327, + "learning_rate": 5.2601084655507336e-06, + "loss": 0.4338, + "step": 3858 + }, + { + "epoch": 2.676144244105409, + "grad_norm": 0.3565492326228381, + "learning_rate": 5.2576910065932266e-06, + "loss": 0.4004, + "step": 3859 + }, + { + "epoch": 2.676837725381415, + "grad_norm": 0.3823889598181103, + "learning_rate": 5.255273487234813e-06, + "loss": 0.4451, + "step": 3860 + }, + { + "epoch": 2.67753120665742, + "grad_norm": 0.353468165004134, + "learning_rate": 5.252855908042142e-06, + "loss": 0.4164, + "step": 3861 + }, + { + "epoch": 2.678224687933426, + "grad_norm": 0.3372804452217276, + "learning_rate": 5.25043826958188e-06, + "loss": 0.4089, + "step": 3862 + }, + { + "epoch": 2.6789181692094313, + "grad_norm": 0.5830089257063927, + "learning_rate": 5.248020572420699e-06, + "loss": 0.4095, + "step": 3863 + }, + { + "epoch": 2.679611650485437, + "grad_norm": 0.4387228300901809, + "learning_rate": 5.245602817125294e-06, + "loss": 0.4846, + "step": 3864 + }, + { + "epoch": 2.6803051317614424, + "grad_norm": 0.444289226140612, + "learning_rate": 5.243185004262365e-06, + "loss": 0.3794, + "step": 3865 + }, + { + "epoch": 2.680998613037448, + "grad_norm": 0.33049029406589236, + "learning_rate": 5.240767134398634e-06, + "loss": 0.3988, + "step": 3866 + }, + { + "epoch": 2.6816920943134535, + "grad_norm": 0.41979607630867255, + "learning_rate": 5.238349208100832e-06, + "loss": 0.4828, + "step": 3867 + }, + { + "epoch": 2.6823855755894592, + "grad_norm": 0.3847292273392992, + "learning_rate": 5.235931225935699e-06, + "loss": 0.4333, + "step": 3868 + }, + { + "epoch": 2.6830790568654646, + "grad_norm": 0.36875801078235587, + "learning_rate": 5.2335131884699965e-06, + "loss": 0.4368, + "step": 3869 + }, + { + "epoch": 2.6837725381414703, + "grad_norm": 0.3735920475953635, + "learning_rate": 5.231095096270493e-06, + "loss": 0.3928, + "step": 3870 + }, + { + "epoch": 2.6844660194174756, + "grad_norm": 0.4401761489296933, + "learning_rate": 5.228676949903974e-06, + "loss": 0.4658, + "step": 3871 + }, + { + "epoch": 2.6851595006934814, + "grad_norm": 0.40798354428187616, + "learning_rate": 5.226258749937232e-06, + "loss": 0.4463, + "step": 3872 + }, + { + "epoch": 2.6858529819694867, + "grad_norm": 0.36124373823914524, + "learning_rate": 5.2238404969370795e-06, + "loss": 0.3917, + "step": 3873 + }, + { + "epoch": 2.6865464632454925, + "grad_norm": 0.39546396662256333, + "learning_rate": 5.221422191470335e-06, + "loss": 0.4414, + "step": 3874 + }, + { + "epoch": 2.687239944521498, + "grad_norm": 0.35744152792064293, + "learning_rate": 5.2190038341038315e-06, + "loss": 0.3899, + "step": 3875 + }, + { + "epoch": 2.6879334257975036, + "grad_norm": 0.3781062902299069, + "learning_rate": 5.216585425404417e-06, + "loss": 0.4787, + "step": 3876 + }, + { + "epoch": 2.688626907073509, + "grad_norm": 0.4247252277253531, + "learning_rate": 5.214166965938947e-06, + "loss": 0.402, + "step": 3877 + }, + { + "epoch": 2.6893203883495147, + "grad_norm": 0.35503088527907695, + "learning_rate": 5.211748456274291e-06, + "loss": 0.356, + "step": 3878 + }, + { + "epoch": 2.69001386962552, + "grad_norm": 0.3785257962903678, + "learning_rate": 5.20932989697733e-06, + "loss": 0.434, + "step": 3879 + }, + { + "epoch": 2.690707350901526, + "grad_norm": 0.39241219128103705, + "learning_rate": 5.2069112886149564e-06, + "loss": 0.478, + "step": 3880 + }, + { + "epoch": 2.691400832177531, + "grad_norm": 0.33931724990554285, + "learning_rate": 5.204492631754078e-06, + "loss": 0.4215, + "step": 3881 + }, + { + "epoch": 2.692094313453537, + "grad_norm": 1.4503540627682872, + "learning_rate": 5.202073926961606e-06, + "loss": 0.4134, + "step": 3882 + }, + { + "epoch": 2.692787794729542, + "grad_norm": 0.37210848766355686, + "learning_rate": 5.1996551748044685e-06, + "loss": 0.4377, + "step": 3883 + }, + { + "epoch": 2.693481276005548, + "grad_norm": 0.37487556806381966, + "learning_rate": 5.197236375849604e-06, + "loss": 0.3642, + "step": 3884 + }, + { + "epoch": 2.6941747572815533, + "grad_norm": 0.3511259028253951, + "learning_rate": 5.1948175306639625e-06, + "loss": 0.4125, + "step": 3885 + }, + { + "epoch": 2.694868238557559, + "grad_norm": 0.3728669647776683, + "learning_rate": 5.192398639814503e-06, + "loss": 0.4129, + "step": 3886 + }, + { + "epoch": 2.6955617198335644, + "grad_norm": 0.37291981358264475, + "learning_rate": 5.189979703868195e-06, + "loss": 0.4068, + "step": 3887 + }, + { + "epoch": 2.69625520110957, + "grad_norm": 0.3829356302509595, + "learning_rate": 5.187560723392019e-06, + "loss": 0.4089, + "step": 3888 + }, + { + "epoch": 2.6969486823855755, + "grad_norm": 0.3747899019215926, + "learning_rate": 5.1851416989529705e-06, + "loss": 0.4109, + "step": 3889 + }, + { + "epoch": 2.6976421636615813, + "grad_norm": 0.36963536144636167, + "learning_rate": 5.182722631118048e-06, + "loss": 0.429, + "step": 3890 + }, + { + "epoch": 2.6983356449375866, + "grad_norm": 0.35026408797808406, + "learning_rate": 5.180303520454263e-06, + "loss": 0.3941, + "step": 3891 + }, + { + "epoch": 2.6990291262135924, + "grad_norm": 0.37951603060911604, + "learning_rate": 5.177884367528637e-06, + "loss": 0.4464, + "step": 3892 + }, + { + "epoch": 2.6997226074895977, + "grad_norm": 0.36419724131904624, + "learning_rate": 5.1754651729082075e-06, + "loss": 0.432, + "step": 3893 + }, + { + "epoch": 2.7004160887656035, + "grad_norm": 0.33490403441036426, + "learning_rate": 5.173045937160011e-06, + "loss": 0.4361, + "step": 3894 + }, + { + "epoch": 2.701109570041609, + "grad_norm": 0.38031713587647936, + "learning_rate": 5.170626660851099e-06, + "loss": 0.3977, + "step": 3895 + }, + { + "epoch": 2.7018030513176146, + "grad_norm": 0.40244117903258303, + "learning_rate": 5.168207344548534e-06, + "loss": 0.4319, + "step": 3896 + }, + { + "epoch": 2.70249653259362, + "grad_norm": 0.3762204461756013, + "learning_rate": 5.165787988819384e-06, + "loss": 0.4057, + "step": 3897 + }, + { + "epoch": 2.7031900138696257, + "grad_norm": 0.3931808032627882, + "learning_rate": 5.163368594230732e-06, + "loss": 0.4209, + "step": 3898 + }, + { + "epoch": 2.703883495145631, + "grad_norm": 0.3350320336599286, + "learning_rate": 5.160949161349665e-06, + "loss": 0.3606, + "step": 3899 + }, + { + "epoch": 2.7045769764216367, + "grad_norm": 0.36981453923450963, + "learning_rate": 5.158529690743279e-06, + "loss": 0.4232, + "step": 3900 + }, + { + "epoch": 2.705270457697642, + "grad_norm": 0.40037405326480896, + "learning_rate": 5.156110182978682e-06, + "loss": 0.4781, + "step": 3901 + }, + { + "epoch": 2.705963938973648, + "grad_norm": 0.35507189122059263, + "learning_rate": 5.153690638622989e-06, + "loss": 0.39, + "step": 3902 + }, + { + "epoch": 2.706657420249653, + "grad_norm": 0.3679326862552085, + "learning_rate": 5.1512710582433246e-06, + "loss": 0.434, + "step": 3903 + }, + { + "epoch": 2.707350901525659, + "grad_norm": 0.3617900153838919, + "learning_rate": 5.148851442406817e-06, + "loss": 0.4125, + "step": 3904 + }, + { + "epoch": 2.7080443828016643, + "grad_norm": 0.4024626544605178, + "learning_rate": 5.1464317916806115e-06, + "loss": 0.4068, + "step": 3905 + }, + { + "epoch": 2.70873786407767, + "grad_norm": 0.3758695499541493, + "learning_rate": 5.1440121066318526e-06, + "loss": 0.4396, + "step": 3906 + }, + { + "epoch": 2.7094313453536754, + "grad_norm": 0.3724970119974219, + "learning_rate": 5.141592387827701e-06, + "loss": 0.4185, + "step": 3907 + }, + { + "epoch": 2.710124826629681, + "grad_norm": 0.4051626400875616, + "learning_rate": 5.1391726358353174e-06, + "loss": 0.3857, + "step": 3908 + }, + { + "epoch": 2.7108183079056865, + "grad_norm": 0.38310603850455355, + "learning_rate": 5.136752851221878e-06, + "loss": 0.4384, + "step": 3909 + }, + { + "epoch": 2.7115117891816922, + "grad_norm": 0.3708684290872625, + "learning_rate": 5.134333034554559e-06, + "loss": 0.4449, + "step": 3910 + }, + { + "epoch": 2.7122052704576975, + "grad_norm": 0.3853142008963356, + "learning_rate": 5.13191318640055e-06, + "loss": 0.4846, + "step": 3911 + }, + { + "epoch": 2.7128987517337033, + "grad_norm": 0.35251127257771286, + "learning_rate": 5.1294933073270455e-06, + "loss": 0.4171, + "step": 3912 + }, + { + "epoch": 2.7135922330097086, + "grad_norm": 0.39257004227116815, + "learning_rate": 5.127073397901248e-06, + "loss": 0.494, + "step": 3913 + }, + { + "epoch": 2.7142857142857144, + "grad_norm": 0.365390730712355, + "learning_rate": 5.1246534586903655e-06, + "loss": 0.4257, + "step": 3914 + }, + { + "epoch": 2.7149791955617197, + "grad_norm": 0.34865437840966107, + "learning_rate": 5.122233490261615e-06, + "loss": 0.4062, + "step": 3915 + }, + { + "epoch": 2.7156726768377255, + "grad_norm": 0.4707393739876894, + "learning_rate": 5.119813493182221e-06, + "loss": 0.4109, + "step": 3916 + }, + { + "epoch": 2.716366158113731, + "grad_norm": 0.4518684083272908, + "learning_rate": 5.1173934680194105e-06, + "loss": 0.4118, + "step": 3917 + }, + { + "epoch": 2.7170596393897366, + "grad_norm": 0.37911556292159143, + "learning_rate": 5.114973415340422e-06, + "loss": 0.4346, + "step": 3918 + }, + { + "epoch": 2.717753120665742, + "grad_norm": 0.3737425317971622, + "learning_rate": 5.112553335712497e-06, + "loss": 0.4407, + "step": 3919 + }, + { + "epoch": 2.7184466019417477, + "grad_norm": 0.39800718953842057, + "learning_rate": 5.110133229702886e-06, + "loss": 0.5039, + "step": 3920 + }, + { + "epoch": 2.719140083217753, + "grad_norm": 0.35230826607601595, + "learning_rate": 5.107713097878842e-06, + "loss": 0.43, + "step": 3921 + }, + { + "epoch": 2.719833564493759, + "grad_norm": 0.37002909636834563, + "learning_rate": 5.10529294080763e-06, + "loss": 0.4116, + "step": 3922 + }, + { + "epoch": 2.720527045769764, + "grad_norm": 0.3915145136163304, + "learning_rate": 5.102872759056514e-06, + "loss": 0.4329, + "step": 3923 + }, + { + "epoch": 2.72122052704577, + "grad_norm": 0.3735224585269615, + "learning_rate": 5.100452553192769e-06, + "loss": 0.4364, + "step": 3924 + }, + { + "epoch": 2.721914008321775, + "grad_norm": 0.34175263458947175, + "learning_rate": 5.098032323783673e-06, + "loss": 0.4449, + "step": 3925 + }, + { + "epoch": 2.722607489597781, + "grad_norm": 0.35448996796947957, + "learning_rate": 5.09561207139651e-06, + "loss": 0.4009, + "step": 3926 + }, + { + "epoch": 2.7233009708737863, + "grad_norm": 0.3794775715329324, + "learning_rate": 5.093191796598571e-06, + "loss": 0.3595, + "step": 3927 + }, + { + "epoch": 2.723994452149792, + "grad_norm": 0.4761010583957489, + "learning_rate": 5.090771499957148e-06, + "loss": 0.4123, + "step": 3928 + }, + { + "epoch": 2.7246879334257974, + "grad_norm": 0.344154015482923, + "learning_rate": 5.0883511820395425e-06, + "loss": 0.4533, + "step": 3929 + }, + { + "epoch": 2.725381414701803, + "grad_norm": 0.3862713584563973, + "learning_rate": 5.085930843413062e-06, + "loss": 0.3992, + "step": 3930 + }, + { + "epoch": 2.7260748959778085, + "grad_norm": 0.34012845184880824, + "learning_rate": 5.083510484645013e-06, + "loss": 0.3915, + "step": 3931 + }, + { + "epoch": 2.7267683772538143, + "grad_norm": 0.39418239618454676, + "learning_rate": 5.081090106302711e-06, + "loss": 0.4575, + "step": 3932 + }, + { + "epoch": 2.7274618585298196, + "grad_norm": 0.7232255231081439, + "learning_rate": 5.078669708953475e-06, + "loss": 0.4453, + "step": 3933 + }, + { + "epoch": 2.7281553398058254, + "grad_norm": 0.34048259015435967, + "learning_rate": 5.07624929316463e-06, + "loss": 0.4231, + "step": 3934 + }, + { + "epoch": 2.7288488210818307, + "grad_norm": 0.3794902372204936, + "learning_rate": 5.073828859503504e-06, + "loss": 0.4642, + "step": 3935 + }, + { + "epoch": 2.7295423023578365, + "grad_norm": 0.43782599599154987, + "learning_rate": 5.071408408537426e-06, + "loss": 0.4397, + "step": 3936 + }, + { + "epoch": 2.730235783633842, + "grad_norm": 0.39049158873409967, + "learning_rate": 5.068987940833735e-06, + "loss": 0.405, + "step": 3937 + }, + { + "epoch": 2.7309292649098476, + "grad_norm": 0.36966788682157375, + "learning_rate": 5.066567456959769e-06, + "loss": 0.4246, + "step": 3938 + }, + { + "epoch": 2.731622746185853, + "grad_norm": 0.3573657053426213, + "learning_rate": 5.064146957482875e-06, + "loss": 0.4071, + "step": 3939 + }, + { + "epoch": 2.7323162274618586, + "grad_norm": 0.3575676650714386, + "learning_rate": 5.061726442970398e-06, + "loss": 0.4171, + "step": 3940 + }, + { + "epoch": 2.733009708737864, + "grad_norm": 0.37770593071879655, + "learning_rate": 5.059305913989689e-06, + "loss": 0.4123, + "step": 3941 + }, + { + "epoch": 2.7337031900138697, + "grad_norm": 0.38681474442716546, + "learning_rate": 5.0568853711081045e-06, + "loss": 0.4547, + "step": 3942 + }, + { + "epoch": 2.734396671289875, + "grad_norm": 0.37894669466361824, + "learning_rate": 5.054464814893001e-06, + "loss": 0.4359, + "step": 3943 + }, + { + "epoch": 2.735090152565881, + "grad_norm": 0.370032643094225, + "learning_rate": 5.052044245911739e-06, + "loss": 0.4227, + "step": 3944 + }, + { + "epoch": 2.735783633841886, + "grad_norm": 0.3608744852846485, + "learning_rate": 5.0496236647316825e-06, + "loss": 0.4647, + "step": 3945 + }, + { + "epoch": 2.736477115117892, + "grad_norm": 0.35751879544982534, + "learning_rate": 5.047203071920197e-06, + "loss": 0.4436, + "step": 3946 + }, + { + "epoch": 2.7371705963938973, + "grad_norm": 0.35840300538859254, + "learning_rate": 5.0447824680446555e-06, + "loss": 0.4499, + "step": 3947 + }, + { + "epoch": 2.737864077669903, + "grad_norm": 0.57273099874388, + "learning_rate": 5.042361853672429e-06, + "loss": 0.4364, + "step": 3948 + }, + { + "epoch": 2.7385575589459084, + "grad_norm": 0.7607428146567762, + "learning_rate": 5.039941229370887e-06, + "loss": 0.4293, + "step": 3949 + }, + { + "epoch": 2.739251040221914, + "grad_norm": 0.3815644095081445, + "learning_rate": 5.037520595707411e-06, + "loss": 0.4259, + "step": 3950 + }, + { + "epoch": 2.7399445214979194, + "grad_norm": 0.3348934643038374, + "learning_rate": 5.035099953249381e-06, + "loss": 0.4308, + "step": 3951 + }, + { + "epoch": 2.740638002773925, + "grad_norm": 0.34241204079012694, + "learning_rate": 5.032679302564176e-06, + "loss": 0.4109, + "step": 3952 + }, + { + "epoch": 2.7413314840499305, + "grad_norm": 0.4320558159408776, + "learning_rate": 5.030258644219179e-06, + "loss": 0.4792, + "step": 3953 + }, + { + "epoch": 2.7420249653259363, + "grad_norm": 0.43582167757349133, + "learning_rate": 5.027837978781773e-06, + "loss": 0.4145, + "step": 3954 + }, + { + "epoch": 2.7427184466019416, + "grad_norm": 0.39972661212372834, + "learning_rate": 5.025417306819348e-06, + "loss": 0.4051, + "step": 3955 + }, + { + "epoch": 2.7434119278779474, + "grad_norm": 0.39764227036484173, + "learning_rate": 5.022996628899291e-06, + "loss": 0.5107, + "step": 3956 + }, + { + "epoch": 2.7441054091539527, + "grad_norm": 0.3795203690315098, + "learning_rate": 5.0205759455889904e-06, + "loss": 0.3943, + "step": 3957 + }, + { + "epoch": 2.7447988904299585, + "grad_norm": 0.3675255309526919, + "learning_rate": 5.018155257455835e-06, + "loss": 0.4032, + "step": 3958 + }, + { + "epoch": 2.745492371705964, + "grad_norm": 0.4722710304115148, + "learning_rate": 5.0157345650672206e-06, + "loss": 0.4446, + "step": 3959 + }, + { + "epoch": 2.7461858529819696, + "grad_norm": 0.3529200418459598, + "learning_rate": 5.013313868990538e-06, + "loss": 0.4418, + "step": 3960 + }, + { + "epoch": 2.746879334257975, + "grad_norm": 0.39542475467593047, + "learning_rate": 5.010893169793182e-06, + "loss": 0.4869, + "step": 3961 + }, + { + "epoch": 2.7475728155339807, + "grad_norm": 0.4150561060784059, + "learning_rate": 5.008472468042543e-06, + "loss": 0.3769, + "step": 3962 + }, + { + "epoch": 2.748266296809986, + "grad_norm": 0.34132192843229336, + "learning_rate": 5.006051764306021e-06, + "loss": 0.4068, + "step": 3963 + }, + { + "epoch": 2.748959778085992, + "grad_norm": 0.3441180384829794, + "learning_rate": 5.003631059151008e-06, + "loss": 0.3929, + "step": 3964 + }, + { + "epoch": 2.749653259361997, + "grad_norm": 0.34506553264553497, + "learning_rate": 5.001210353144903e-06, + "loss": 0.4202, + "step": 3965 + }, + { + "epoch": 2.750346740638003, + "grad_norm": 0.39071531681282184, + "learning_rate": 4.998789646855099e-06, + "loss": 0.4251, + "step": 3966 + }, + { + "epoch": 2.751040221914008, + "grad_norm": 0.3904383793654294, + "learning_rate": 4.996368940848992e-06, + "loss": 0.4427, + "step": 3967 + }, + { + "epoch": 2.751733703190014, + "grad_norm": 0.3378095906022995, + "learning_rate": 4.99394823569398e-06, + "loss": 0.3693, + "step": 3968 + }, + { + "epoch": 2.7524271844660193, + "grad_norm": 0.3985577545920935, + "learning_rate": 4.991527531957458e-06, + "loss": 0.4413, + "step": 3969 + }, + { + "epoch": 2.753120665742025, + "grad_norm": 0.3900665096057594, + "learning_rate": 4.98910683020682e-06, + "loss": 0.4239, + "step": 3970 + }, + { + "epoch": 2.7538141470180304, + "grad_norm": 0.3701757399908618, + "learning_rate": 4.986686131009464e-06, + "loss": 0.4546, + "step": 3971 + }, + { + "epoch": 2.754507628294036, + "grad_norm": 0.3714286039272502, + "learning_rate": 4.984265434932781e-06, + "loss": 0.406, + "step": 3972 + }, + { + "epoch": 2.7552011095700415, + "grad_norm": 0.3892384977360426, + "learning_rate": 4.981844742544167e-06, + "loss": 0.4415, + "step": 3973 + }, + { + "epoch": 2.7558945908460473, + "grad_norm": 0.35244289816170765, + "learning_rate": 4.979424054411013e-06, + "loss": 0.3944, + "step": 3974 + }, + { + "epoch": 2.7565880721220526, + "grad_norm": 0.38433471246836537, + "learning_rate": 4.97700337110071e-06, + "loss": 0.4087, + "step": 3975 + }, + { + "epoch": 2.7572815533980584, + "grad_norm": 0.3714735430473568, + "learning_rate": 4.974582693180652e-06, + "loss": 0.427, + "step": 3976 + }, + { + "epoch": 2.7579750346740637, + "grad_norm": 0.37266625958308525, + "learning_rate": 4.972162021218228e-06, + "loss": 0.436, + "step": 3977 + }, + { + "epoch": 2.7586685159500695, + "grad_norm": 0.36155232312722235, + "learning_rate": 4.969741355780822e-06, + "loss": 0.4083, + "step": 3978 + }, + { + "epoch": 2.759361997226075, + "grad_norm": 0.3955245521459925, + "learning_rate": 4.9673206974358254e-06, + "loss": 0.3966, + "step": 3979 + }, + { + "epoch": 2.7600554785020806, + "grad_norm": 0.37149560730018827, + "learning_rate": 4.96490004675062e-06, + "loss": 0.4679, + "step": 3980 + }, + { + "epoch": 2.760748959778086, + "grad_norm": 0.6071953926121999, + "learning_rate": 4.96247940429259e-06, + "loss": 0.4214, + "step": 3981 + }, + { + "epoch": 2.7614424410540916, + "grad_norm": 0.3900352334769149, + "learning_rate": 4.9600587706291146e-06, + "loss": 0.4726, + "step": 3982 + }, + { + "epoch": 2.762135922330097, + "grad_norm": 0.34245855701039796, + "learning_rate": 4.957638146327575e-06, + "loss": 0.402, + "step": 3983 + }, + { + "epoch": 2.7628294036061027, + "grad_norm": 0.3572282010192532, + "learning_rate": 4.9552175319553445e-06, + "loss": 0.4258, + "step": 3984 + }, + { + "epoch": 2.763522884882108, + "grad_norm": 0.3825599970725927, + "learning_rate": 4.9527969280798025e-06, + "loss": 0.4302, + "step": 3985 + }, + { + "epoch": 2.764216366158114, + "grad_norm": 0.37865087687600074, + "learning_rate": 4.950376335268319e-06, + "loss": 0.3837, + "step": 3986 + }, + { + "epoch": 2.764909847434119, + "grad_norm": 0.41425995862562354, + "learning_rate": 4.947955754088263e-06, + "loss": 0.4751, + "step": 3987 + }, + { + "epoch": 2.765603328710125, + "grad_norm": 0.41886747355440135, + "learning_rate": 4.945535185107e-06, + "loss": 0.4312, + "step": 3988 + }, + { + "epoch": 2.7662968099861303, + "grad_norm": 0.3599878162392445, + "learning_rate": 4.943114628891897e-06, + "loss": 0.4714, + "step": 3989 + }, + { + "epoch": 2.766990291262136, + "grad_norm": 0.4301473507839401, + "learning_rate": 4.940694086010312e-06, + "loss": 0.463, + "step": 3990 + }, + { + "epoch": 2.7676837725381414, + "grad_norm": 0.3545771469108082, + "learning_rate": 4.938273557029604e-06, + "loss": 0.4024, + "step": 3991 + }, + { + "epoch": 2.768377253814147, + "grad_norm": 0.3973740084240133, + "learning_rate": 4.935853042517127e-06, + "loss": 0.455, + "step": 3992 + }, + { + "epoch": 2.7690707350901524, + "grad_norm": 0.3534259407642916, + "learning_rate": 4.933432543040232e-06, + "loss": 0.4356, + "step": 3993 + }, + { + "epoch": 2.769764216366158, + "grad_norm": 0.39654648286811706, + "learning_rate": 4.931012059166267e-06, + "loss": 0.394, + "step": 3994 + }, + { + "epoch": 2.7704576976421635, + "grad_norm": 0.37925078010425023, + "learning_rate": 4.928591591462575e-06, + "loss": 0.427, + "step": 3995 + }, + { + "epoch": 2.7711511789181693, + "grad_norm": 0.4167478976360642, + "learning_rate": 4.926171140496498e-06, + "loss": 0.4495, + "step": 3996 + }, + { + "epoch": 2.7718446601941746, + "grad_norm": 0.3506857658306157, + "learning_rate": 4.923750706835371e-06, + "loss": 0.4168, + "step": 3997 + }, + { + "epoch": 2.7725381414701804, + "grad_norm": 0.3622577748822191, + "learning_rate": 4.921330291046526e-06, + "loss": 0.3713, + "step": 3998 + }, + { + "epoch": 2.7732316227461857, + "grad_norm": 0.37554857340453845, + "learning_rate": 4.91890989369729e-06, + "loss": 0.3883, + "step": 3999 + }, + { + "epoch": 2.7739251040221915, + "grad_norm": 0.4537593666625834, + "learning_rate": 4.9164895153549894e-06, + "loss": 0.4532, + "step": 4000 + }, + { + "epoch": 2.774618585298197, + "grad_norm": 0.4054612480562233, + "learning_rate": 4.914069156586941e-06, + "loss": 0.4023, + "step": 4001 + }, + { + "epoch": 2.7753120665742026, + "grad_norm": 0.3805519937829709, + "learning_rate": 4.9116488179604575e-06, + "loss": 0.4106, + "step": 4002 + }, + { + "epoch": 2.776005547850208, + "grad_norm": 0.38545256445163834, + "learning_rate": 4.909228500042852e-06, + "loss": 0.4103, + "step": 4003 + }, + { + "epoch": 2.7766990291262137, + "grad_norm": 0.9463666155927506, + "learning_rate": 4.9068082034014305e-06, + "loss": 0.465, + "step": 4004 + }, + { + "epoch": 2.777392510402219, + "grad_norm": 0.4007506620101125, + "learning_rate": 4.904387928603491e-06, + "loss": 0.4682, + "step": 4005 + }, + { + "epoch": 2.778085991678225, + "grad_norm": 0.38684697926971456, + "learning_rate": 4.901967676216329e-06, + "loss": 0.4115, + "step": 4006 + }, + { + "epoch": 2.77877947295423, + "grad_norm": 0.34583373114051424, + "learning_rate": 4.899547446807232e-06, + "loss": 0.4459, + "step": 4007 + }, + { + "epoch": 2.779472954230236, + "grad_norm": 0.38627086393795584, + "learning_rate": 4.897127240943487e-06, + "loss": 0.4446, + "step": 4008 + }, + { + "epoch": 2.780166435506241, + "grad_norm": 0.3883560058879765, + "learning_rate": 4.894707059192372e-06, + "loss": 0.388, + "step": 4009 + }, + { + "epoch": 2.780859916782247, + "grad_norm": 0.3503374636503669, + "learning_rate": 4.892286902121159e-06, + "loss": 0.4266, + "step": 4010 + }, + { + "epoch": 2.7815533980582523, + "grad_norm": 0.350352145826333, + "learning_rate": 4.889866770297116e-06, + "loss": 0.4395, + "step": 4011 + }, + { + "epoch": 2.782246879334258, + "grad_norm": 0.3642627761438329, + "learning_rate": 4.887446664287504e-06, + "loss": 0.4272, + "step": 4012 + }, + { + "epoch": 2.7829403606102634, + "grad_norm": 0.3434874097966225, + "learning_rate": 4.885026584659579e-06, + "loss": 0.4149, + "step": 4013 + }, + { + "epoch": 2.783633841886269, + "grad_norm": 0.3621844235954397, + "learning_rate": 4.882606531980591e-06, + "loss": 0.3828, + "step": 4014 + }, + { + "epoch": 2.7843273231622745, + "grad_norm": 0.3427016163640971, + "learning_rate": 4.880186506817781e-06, + "loss": 0.4345, + "step": 4015 + }, + { + "epoch": 2.7850208044382803, + "grad_norm": 0.36054048320306425, + "learning_rate": 4.877766509738386e-06, + "loss": 0.4492, + "step": 4016 + }, + { + "epoch": 2.7857142857142856, + "grad_norm": 0.32912577313513003, + "learning_rate": 4.875346541309637e-06, + "loss": 0.3833, + "step": 4017 + }, + { + "epoch": 2.7864077669902914, + "grad_norm": 0.35524338466273975, + "learning_rate": 4.872926602098756e-06, + "loss": 0.381, + "step": 4018 + }, + { + "epoch": 2.7871012482662967, + "grad_norm": 0.40455749188681955, + "learning_rate": 4.870506692672957e-06, + "loss": 0.4709, + "step": 4019 + }, + { + "epoch": 2.7877947295423025, + "grad_norm": 0.40182790959614184, + "learning_rate": 4.86808681359945e-06, + "loss": 0.4469, + "step": 4020 + }, + { + "epoch": 2.7884882108183078, + "grad_norm": 0.3965258452569223, + "learning_rate": 4.865666965445442e-06, + "loss": 0.4475, + "step": 4021 + }, + { + "epoch": 2.7891816920943135, + "grad_norm": 0.3877523551800925, + "learning_rate": 4.863247148778124e-06, + "loss": 0.4664, + "step": 4022 + }, + { + "epoch": 2.789875173370319, + "grad_norm": 0.3586594465868982, + "learning_rate": 4.860827364164683e-06, + "loss": 0.4111, + "step": 4023 + }, + { + "epoch": 2.7905686546463246, + "grad_norm": 0.37368248000961435, + "learning_rate": 4.8584076121723e-06, + "loss": 0.414, + "step": 4024 + }, + { + "epoch": 2.79126213592233, + "grad_norm": 0.386644055137649, + "learning_rate": 4.855987893368148e-06, + "loss": 0.4293, + "step": 4025 + }, + { + "epoch": 2.7919556171983357, + "grad_norm": 0.36677929808420884, + "learning_rate": 4.853568208319391e-06, + "loss": 0.4321, + "step": 4026 + }, + { + "epoch": 2.792649098474341, + "grad_norm": 0.3789945636294229, + "learning_rate": 4.851148557593185e-06, + "loss": 0.3937, + "step": 4027 + }, + { + "epoch": 2.793342579750347, + "grad_norm": 0.3480247387809359, + "learning_rate": 4.848728941756679e-06, + "loss": 0.439, + "step": 4028 + }, + { + "epoch": 2.794036061026352, + "grad_norm": 0.36352830102050243, + "learning_rate": 4.846309361377011e-06, + "loss": 0.4447, + "step": 4029 + }, + { + "epoch": 2.794729542302358, + "grad_norm": 0.366325211500067, + "learning_rate": 4.843889817021318e-06, + "loss": 0.3997, + "step": 4030 + }, + { + "epoch": 2.7954230235783633, + "grad_norm": 0.3545734829458388, + "learning_rate": 4.841470309256722e-06, + "loss": 0.4299, + "step": 4031 + }, + { + "epoch": 2.796116504854369, + "grad_norm": 0.5004600048170659, + "learning_rate": 4.839050838650336e-06, + "loss": 0.4791, + "step": 4032 + }, + { + "epoch": 2.7968099861303743, + "grad_norm": 0.4024612900778736, + "learning_rate": 4.8366314057692684e-06, + "loss": 0.4252, + "step": 4033 + }, + { + "epoch": 2.79750346740638, + "grad_norm": 0.35560712170623915, + "learning_rate": 4.834212011180617e-06, + "loss": 0.4092, + "step": 4034 + }, + { + "epoch": 2.7981969486823854, + "grad_norm": 0.3856876940046428, + "learning_rate": 4.831792655451468e-06, + "loss": 0.4575, + "step": 4035 + }, + { + "epoch": 2.798890429958391, + "grad_norm": 0.3591191578303279, + "learning_rate": 4.829373339148903e-06, + "loss": 0.4049, + "step": 4036 + }, + { + "epoch": 2.7995839112343965, + "grad_norm": 0.37283837251655866, + "learning_rate": 4.8269540628399925e-06, + "loss": 0.4596, + "step": 4037 + }, + { + "epoch": 2.8002773925104023, + "grad_norm": 0.3845580357058165, + "learning_rate": 4.824534827091793e-06, + "loss": 0.3955, + "step": 4038 + }, + { + "epoch": 2.8009708737864076, + "grad_norm": 0.3552336919736331, + "learning_rate": 4.822115632471363e-06, + "loss": 0.4439, + "step": 4039 + }, + { + "epoch": 2.8016643550624134, + "grad_norm": 0.3508682336015801, + "learning_rate": 4.819696479545738e-06, + "loss": 0.3975, + "step": 4040 + }, + { + "epoch": 2.8023578363384187, + "grad_norm": 0.4022142157664882, + "learning_rate": 4.817277368881954e-06, + "loss": 0.4128, + "step": 4041 + }, + { + "epoch": 2.8030513176144245, + "grad_norm": 0.4143632286344754, + "learning_rate": 4.814858301047031e-06, + "loss": 0.3935, + "step": 4042 + }, + { + "epoch": 2.80374479889043, + "grad_norm": 0.3959569165255466, + "learning_rate": 4.812439276607982e-06, + "loss": 0.4562, + "step": 4043 + }, + { + "epoch": 2.8044382801664356, + "grad_norm": 0.3551192583821301, + "learning_rate": 4.810020296131807e-06, + "loss": 0.4497, + "step": 4044 + }, + { + "epoch": 2.805131761442441, + "grad_norm": 0.42301204900694334, + "learning_rate": 4.8076013601854996e-06, + "loss": 0.4257, + "step": 4045 + }, + { + "epoch": 2.8058252427184467, + "grad_norm": 0.533627383552703, + "learning_rate": 4.80518246933604e-06, + "loss": 0.3952, + "step": 4046 + }, + { + "epoch": 2.806518723994452, + "grad_norm": 0.32971782919941844, + "learning_rate": 4.802763624150396e-06, + "loss": 0.4018, + "step": 4047 + }, + { + "epoch": 2.807212205270458, + "grad_norm": 0.3236523786576215, + "learning_rate": 4.800344825195533e-06, + "loss": 0.364, + "step": 4048 + }, + { + "epoch": 2.807905686546463, + "grad_norm": 0.3884875215326674, + "learning_rate": 4.7979260730383954e-06, + "loss": 0.427, + "step": 4049 + }, + { + "epoch": 2.808599167822469, + "grad_norm": 0.3687340718711838, + "learning_rate": 4.795507368245924e-06, + "loss": 0.4238, + "step": 4050 + }, + { + "epoch": 2.809292649098474, + "grad_norm": 0.3945178218662022, + "learning_rate": 4.793088711385044e-06, + "loss": 0.4485, + "step": 4051 + }, + { + "epoch": 2.80998613037448, + "grad_norm": 0.37202183009946754, + "learning_rate": 4.790670103022672e-06, + "loss": 0.4337, + "step": 4052 + }, + { + "epoch": 2.8106796116504853, + "grad_norm": 0.38427164244341955, + "learning_rate": 4.788251543725711e-06, + "loss": 0.4183, + "step": 4053 + }, + { + "epoch": 2.811373092926491, + "grad_norm": 0.33786128218826866, + "learning_rate": 4.785833034061056e-06, + "loss": 0.4126, + "step": 4054 + }, + { + "epoch": 2.8120665742024964, + "grad_norm": 0.3856524875438802, + "learning_rate": 4.783414574595585e-06, + "loss": 0.4988, + "step": 4055 + }, + { + "epoch": 2.812760055478502, + "grad_norm": 0.3707925529065902, + "learning_rate": 4.780996165896169e-06, + "loss": 0.3866, + "step": 4056 + }, + { + "epoch": 2.8134535367545075, + "grad_norm": 0.3987336385136071, + "learning_rate": 4.778577808529666e-06, + "loss": 0.4327, + "step": 4057 + }, + { + "epoch": 2.8141470180305133, + "grad_norm": 0.3696563322135868, + "learning_rate": 4.776159503062922e-06, + "loss": 0.4161, + "step": 4058 + }, + { + "epoch": 2.8148404993065186, + "grad_norm": 0.3829064231711528, + "learning_rate": 4.7737412500627694e-06, + "loss": 0.4305, + "step": 4059 + }, + { + "epoch": 2.8155339805825244, + "grad_norm": 0.42212570468835886, + "learning_rate": 4.771323050096028e-06, + "loss": 0.402, + "step": 4060 + }, + { + "epoch": 2.8162274618585297, + "grad_norm": 0.3508844344112231, + "learning_rate": 4.768904903729509e-06, + "loss": 0.3635, + "step": 4061 + }, + { + "epoch": 2.8169209431345354, + "grad_norm": 0.38245947414316256, + "learning_rate": 4.766486811530006e-06, + "loss": 0.4268, + "step": 4062 + }, + { + "epoch": 2.8176144244105408, + "grad_norm": 0.34169649762997983, + "learning_rate": 4.764068774064304e-06, + "loss": 0.3638, + "step": 4063 + }, + { + "epoch": 2.8183079056865465, + "grad_norm": 0.40942647151390993, + "learning_rate": 4.76165079189917e-06, + "loss": 0.4022, + "step": 4064 + }, + { + "epoch": 2.819001386962552, + "grad_norm": 0.4402489576229297, + "learning_rate": 4.759232865601366e-06, + "loss": 0.4559, + "step": 4065 + }, + { + "epoch": 2.8196948682385576, + "grad_norm": 0.348823874503297, + "learning_rate": 4.756814995737635e-06, + "loss": 0.4472, + "step": 4066 + }, + { + "epoch": 2.820388349514563, + "grad_norm": 0.37775853749888355, + "learning_rate": 4.754397182874708e-06, + "loss": 0.4074, + "step": 4067 + }, + { + "epoch": 2.8210818307905687, + "grad_norm": 0.4257208050903815, + "learning_rate": 4.7519794275793015e-06, + "loss": 0.4482, + "step": 4068 + }, + { + "epoch": 2.821775312066574, + "grad_norm": 0.3521327762265513, + "learning_rate": 4.749561730418121e-06, + "loss": 0.4219, + "step": 4069 + }, + { + "epoch": 2.82246879334258, + "grad_norm": 0.34980838516587615, + "learning_rate": 4.7471440919578585e-06, + "loss": 0.428, + "step": 4070 + }, + { + "epoch": 2.823162274618585, + "grad_norm": 0.4258849417751294, + "learning_rate": 4.744726512765189e-06, + "loss": 0.4413, + "step": 4071 + }, + { + "epoch": 2.823855755894591, + "grad_norm": 0.3513473352205952, + "learning_rate": 4.742308993406775e-06, + "loss": 0.3892, + "step": 4072 + }, + { + "epoch": 2.8245492371705962, + "grad_norm": 0.3518035036634018, + "learning_rate": 4.739891534449267e-06, + "loss": 0.468, + "step": 4073 + }, + { + "epoch": 2.825242718446602, + "grad_norm": 0.390328201896898, + "learning_rate": 4.7374741364592995e-06, + "loss": 0.414, + "step": 4074 + }, + { + "epoch": 2.8259361997226073, + "grad_norm": 0.38985055994312234, + "learning_rate": 4.735056800003494e-06, + "loss": 0.418, + "step": 4075 + }, + { + "epoch": 2.826629680998613, + "grad_norm": 0.4179553528250532, + "learning_rate": 4.732639525648456e-06, + "loss": 0.4692, + "step": 4076 + }, + { + "epoch": 2.8273231622746184, + "grad_norm": 0.34855373666352646, + "learning_rate": 4.730222313960776e-06, + "loss": 0.4048, + "step": 4077 + }, + { + "epoch": 2.828016643550624, + "grad_norm": 0.4185433830631261, + "learning_rate": 4.727805165507032e-06, + "loss": 0.4568, + "step": 4078 + }, + { + "epoch": 2.8287101248266295, + "grad_norm": 0.3827686595456162, + "learning_rate": 4.725388080853786e-06, + "loss": 0.3943, + "step": 4079 + }, + { + "epoch": 2.8294036061026353, + "grad_norm": 0.3868924451054122, + "learning_rate": 4.722971060567584e-06, + "loss": 0.448, + "step": 4080 + }, + { + "epoch": 2.8300970873786406, + "grad_norm": 0.3440702513702837, + "learning_rate": 4.720554105214961e-06, + "loss": 0.4354, + "step": 4081 + }, + { + "epoch": 2.8307905686546464, + "grad_norm": 0.37438743596755825, + "learning_rate": 4.718137215362429e-06, + "loss": 0.4763, + "step": 4082 + }, + { + "epoch": 2.8314840499306517, + "grad_norm": 0.41864735318001417, + "learning_rate": 4.715720391576495e-06, + "loss": 0.3939, + "step": 4083 + }, + { + "epoch": 2.8321775312066575, + "grad_norm": 0.3734249927386904, + "learning_rate": 4.713303634423642e-06, + "loss": 0.4359, + "step": 4084 + }, + { + "epoch": 2.832871012482663, + "grad_norm": 0.3290090027750609, + "learning_rate": 4.71088694447034e-06, + "loss": 0.3989, + "step": 4085 + }, + { + "epoch": 2.8335644937586686, + "grad_norm": 0.36094920238835954, + "learning_rate": 4.708470322283045e-06, + "loss": 0.4524, + "step": 4086 + }, + { + "epoch": 2.834257975034674, + "grad_norm": 0.4766922360121868, + "learning_rate": 4.706053768428195e-06, + "loss": 0.4193, + "step": 4087 + }, + { + "epoch": 2.8349514563106797, + "grad_norm": 0.3698517377046595, + "learning_rate": 4.703637283472213e-06, + "loss": 0.4247, + "step": 4088 + }, + { + "epoch": 2.835644937586685, + "grad_norm": 0.3363697038046532, + "learning_rate": 4.701220867981505e-06, + "loss": 0.3703, + "step": 4089 + }, + { + "epoch": 2.836338418862691, + "grad_norm": 0.386392636656102, + "learning_rate": 4.698804522522462e-06, + "loss": 0.4365, + "step": 4090 + }, + { + "epoch": 2.837031900138696, + "grad_norm": 0.4060050410469409, + "learning_rate": 4.6963882476614555e-06, + "loss": 0.3954, + "step": 4091 + }, + { + "epoch": 2.837725381414702, + "grad_norm": 0.3523887208184266, + "learning_rate": 4.6939720439648465e-06, + "loss": 0.4254, + "step": 4092 + }, + { + "epoch": 2.838418862690707, + "grad_norm": 0.3579985074450476, + "learning_rate": 4.691555911998975e-06, + "loss": 0.4395, + "step": 4093 + }, + { + "epoch": 2.839112343966713, + "grad_norm": 0.4029662350950304, + "learning_rate": 4.689139852330162e-06, + "loss": 0.4546, + "step": 4094 + }, + { + "epoch": 2.8398058252427183, + "grad_norm": 0.35050211108750073, + "learning_rate": 4.686723865524718e-06, + "loss": 0.4299, + "step": 4095 + }, + { + "epoch": 2.840499306518724, + "grad_norm": 0.3451123976059286, + "learning_rate": 4.684307952148931e-06, + "loss": 0.4004, + "step": 4096 + }, + { + "epoch": 2.8411927877947294, + "grad_norm": 0.3796932050622585, + "learning_rate": 4.681892112769072e-06, + "loss": 0.4321, + "step": 4097 + }, + { + "epoch": 2.841886269070735, + "grad_norm": 0.38961027642341317, + "learning_rate": 4.6794763479514e-06, + "loss": 0.4479, + "step": 4098 + }, + { + "epoch": 2.8425797503467405, + "grad_norm": 0.3620956435495506, + "learning_rate": 4.677060658262151e-06, + "loss": 0.4502, + "step": 4099 + }, + { + "epoch": 2.8432732316227463, + "grad_norm": 0.3694923229023742, + "learning_rate": 4.674645044267541e-06, + "loss": 0.4181, + "step": 4100 + }, + { + "epoch": 2.8439667128987516, + "grad_norm": 0.3806111379567479, + "learning_rate": 4.672229506533779e-06, + "loss": 0.4987, + "step": 4101 + }, + { + "epoch": 2.8446601941747574, + "grad_norm": 0.3803688644623606, + "learning_rate": 4.669814045627046e-06, + "loss": 0.4148, + "step": 4102 + }, + { + "epoch": 2.8453536754507627, + "grad_norm": 0.382423290737069, + "learning_rate": 4.667398662113511e-06, + "loss": 0.4808, + "step": 4103 + }, + { + "epoch": 2.8460471567267684, + "grad_norm": 0.3370787002408229, + "learning_rate": 4.664983356559321e-06, + "loss": 0.3859, + "step": 4104 + }, + { + "epoch": 2.8467406380027738, + "grad_norm": 0.38263104851654395, + "learning_rate": 4.662568129530603e-06, + "loss": 0.4449, + "step": 4105 + }, + { + "epoch": 2.8474341192787795, + "grad_norm": 0.3888058065974024, + "learning_rate": 4.660152981593474e-06, + "loss": 0.4272, + "step": 4106 + }, + { + "epoch": 2.848127600554785, + "grad_norm": 0.3455810666774053, + "learning_rate": 4.657737913314025e-06, + "loss": 0.423, + "step": 4107 + }, + { + "epoch": 2.8488210818307906, + "grad_norm": 0.375943631321159, + "learning_rate": 4.65532292525833e-06, + "loss": 0.4528, + "step": 4108 + }, + { + "epoch": 2.849514563106796, + "grad_norm": 0.3766495348477, + "learning_rate": 4.652908017992443e-06, + "loss": 0.4733, + "step": 4109 + }, + { + "epoch": 2.8502080443828017, + "grad_norm": 0.3484261311648054, + "learning_rate": 4.650493192082404e-06, + "loss": 0.4103, + "step": 4110 + }, + { + "epoch": 2.850901525658807, + "grad_norm": 0.3383124399669671, + "learning_rate": 4.64807844809423e-06, + "loss": 0.4131, + "step": 4111 + }, + { + "epoch": 2.851595006934813, + "grad_norm": 0.41783192949638315, + "learning_rate": 4.64566378659392e-06, + "loss": 0.4321, + "step": 4112 + }, + { + "epoch": 2.852288488210818, + "grad_norm": 0.36766480464672224, + "learning_rate": 4.643249208147452e-06, + "loss": 0.4065, + "step": 4113 + }, + { + "epoch": 2.852981969486824, + "grad_norm": 0.4019405627514558, + "learning_rate": 4.640834713320785e-06, + "loss": 0.4298, + "step": 4114 + }, + { + "epoch": 2.8536754507628292, + "grad_norm": 0.40944835516505707, + "learning_rate": 4.63842030267986e-06, + "loss": 0.4431, + "step": 4115 + }, + { + "epoch": 2.854368932038835, + "grad_norm": 0.3809806904706159, + "learning_rate": 4.6360059767905975e-06, + "loss": 0.4633, + "step": 4116 + }, + { + "epoch": 2.8550624133148403, + "grad_norm": 0.36243598662449916, + "learning_rate": 4.6335917362188975e-06, + "loss": 0.4365, + "step": 4117 + }, + { + "epoch": 2.855755894590846, + "grad_norm": 0.34285130789114643, + "learning_rate": 4.63117758153064e-06, + "loss": 0.3951, + "step": 4118 + }, + { + "epoch": 2.8564493758668514, + "grad_norm": 0.3470943676193009, + "learning_rate": 4.628763513291687e-06, + "loss": 0.4373, + "step": 4119 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.3833804947544678, + "learning_rate": 4.626349532067879e-06, + "loss": 0.3982, + "step": 4120 + }, + { + "epoch": 2.8578363384188625, + "grad_norm": 0.3550169695072367, + "learning_rate": 4.623935638425034e-06, + "loss": 0.3878, + "step": 4121 + }, + { + "epoch": 2.8585298196948683, + "grad_norm": 0.37416667163524014, + "learning_rate": 4.621521832928951e-06, + "loss": 0.4041, + "step": 4122 + }, + { + "epoch": 2.8592233009708736, + "grad_norm": 0.3630927983036487, + "learning_rate": 4.619108116145411e-06, + "loss": 0.4001, + "step": 4123 + }, + { + "epoch": 2.8599167822468794, + "grad_norm": 0.38279124194809155, + "learning_rate": 4.616694488640169e-06, + "loss": 0.418, + "step": 4124 + }, + { + "epoch": 2.8606102635228847, + "grad_norm": 0.36234016663909513, + "learning_rate": 4.614280950978964e-06, + "loss": 0.4704, + "step": 4125 + }, + { + "epoch": 2.8613037447988905, + "grad_norm": 0.42495213346921995, + "learning_rate": 4.611867503727508e-06, + "loss": 0.4214, + "step": 4126 + }, + { + "epoch": 2.861997226074896, + "grad_norm": 0.36159187678580645, + "learning_rate": 4.6094541474514985e-06, + "loss": 0.4273, + "step": 4127 + }, + { + "epoch": 2.8626907073509016, + "grad_norm": 0.3408973458656512, + "learning_rate": 4.607040882716609e-06, + "loss": 0.3798, + "step": 4128 + }, + { + "epoch": 2.863384188626907, + "grad_norm": 0.40945625197433955, + "learning_rate": 4.604627710088492e-06, + "loss": 0.4845, + "step": 4129 + }, + { + "epoch": 2.8640776699029127, + "grad_norm": 0.4645050313987877, + "learning_rate": 4.6022146301327755e-06, + "loss": 0.4545, + "step": 4130 + }, + { + "epoch": 2.864771151178918, + "grad_norm": 0.4146496248402175, + "learning_rate": 4.599801643415069e-06, + "loss": 0.4404, + "step": 4131 + }, + { + "epoch": 2.8654646324549238, + "grad_norm": 0.3298645925276056, + "learning_rate": 4.597388750500959e-06, + "loss": 0.3853, + "step": 4132 + }, + { + "epoch": 2.866158113730929, + "grad_norm": 0.3754955120945029, + "learning_rate": 4.59497595195601e-06, + "loss": 0.4285, + "step": 4133 + }, + { + "epoch": 2.866851595006935, + "grad_norm": 0.3999373734210744, + "learning_rate": 4.5925632483457635e-06, + "loss": 0.4232, + "step": 4134 + }, + { + "epoch": 2.86754507628294, + "grad_norm": 0.3331439598554828, + "learning_rate": 4.590150640235742e-06, + "loss": 0.4236, + "step": 4135 + }, + { + "epoch": 2.868238557558946, + "grad_norm": 0.36819641326853636, + "learning_rate": 4.58773812819144e-06, + "loss": 0.4369, + "step": 4136 + }, + { + "epoch": 2.8689320388349513, + "grad_norm": 0.37453329326162216, + "learning_rate": 4.585325712778338e-06, + "loss": 0.433, + "step": 4137 + }, + { + "epoch": 2.869625520110957, + "grad_norm": 0.35624968189002554, + "learning_rate": 4.582913394561884e-06, + "loss": 0.4452, + "step": 4138 + }, + { + "epoch": 2.8703190013869624, + "grad_norm": 0.37477086333955983, + "learning_rate": 4.5805011741075095e-06, + "loss": 0.4541, + "step": 4139 + }, + { + "epoch": 2.871012482662968, + "grad_norm": 0.3876290327732691, + "learning_rate": 4.578089051980622e-06, + "loss": 0.4416, + "step": 4140 + }, + { + "epoch": 2.8717059639389735, + "grad_norm": 0.3953283849497786, + "learning_rate": 4.575677028746606e-06, + "loss": 0.4902, + "step": 4141 + }, + { + "epoch": 2.8723994452149793, + "grad_norm": 0.3561636683250079, + "learning_rate": 4.57326510497082e-06, + "loss": 0.4452, + "step": 4142 + }, + { + "epoch": 2.8730929264909846, + "grad_norm": 0.3766848888549487, + "learning_rate": 4.570853281218605e-06, + "loss": 0.4119, + "step": 4143 + }, + { + "epoch": 2.8737864077669903, + "grad_norm": 0.3740385918636498, + "learning_rate": 4.568441558055271e-06, + "loss": 0.4069, + "step": 4144 + }, + { + "epoch": 2.8744798890429957, + "grad_norm": 0.36726727356780703, + "learning_rate": 4.566029936046109e-06, + "loss": 0.4816, + "step": 4145 + }, + { + "epoch": 2.8751733703190014, + "grad_norm": 0.36866803601355214, + "learning_rate": 4.563618415756389e-06, + "loss": 0.3683, + "step": 4146 + }, + { + "epoch": 2.875866851595007, + "grad_norm": 0.39472160529342576, + "learning_rate": 4.561206997751352e-06, + "loss": 0.4203, + "step": 4147 + }, + { + "epoch": 2.8765603328710125, + "grad_norm": 0.3442874064224455, + "learning_rate": 4.558795682596216e-06, + "loss": 0.4165, + "step": 4148 + }, + { + "epoch": 2.877253814147018, + "grad_norm": 0.38644588415967007, + "learning_rate": 4.556384470856177e-06, + "loss": 0.4127, + "step": 4149 + }, + { + "epoch": 2.8779472954230236, + "grad_norm": 0.3619612092465305, + "learning_rate": 4.553973363096405e-06, + "loss": 0.4054, + "step": 4150 + }, + { + "epoch": 2.8786407766990294, + "grad_norm": 0.3448337637953116, + "learning_rate": 4.551562359882048e-06, + "loss": 0.4177, + "step": 4151 + }, + { + "epoch": 2.8793342579750347, + "grad_norm": 0.38364920269799246, + "learning_rate": 4.549151461778225e-06, + "loss": 0.4563, + "step": 4152 + }, + { + "epoch": 2.88002773925104, + "grad_norm": 0.3723559507453039, + "learning_rate": 4.546740669350034e-06, + "loss": 0.4109, + "step": 4153 + }, + { + "epoch": 2.880721220527046, + "grad_norm": 0.39681929155284845, + "learning_rate": 4.5443299831625455e-06, + "loss": 0.4478, + "step": 4154 + }, + { + "epoch": 2.8814147018030516, + "grad_norm": 0.3586297582322779, + "learning_rate": 4.54191940378081e-06, + "loss": 0.4132, + "step": 4155 + }, + { + "epoch": 2.882108183079057, + "grad_norm": 0.5746156961669232, + "learning_rate": 4.53950893176985e-06, + "loss": 0.3897, + "step": 4156 + }, + { + "epoch": 2.8828016643550622, + "grad_norm": 0.38865913616338227, + "learning_rate": 4.537098567694661e-06, + "loss": 0.4411, + "step": 4157 + }, + { + "epoch": 2.883495145631068, + "grad_norm": 0.39901877163538707, + "learning_rate": 4.534688312120216e-06, + "loss": 0.4294, + "step": 4158 + }, + { + "epoch": 2.884188626907074, + "grad_norm": 0.37308151176484683, + "learning_rate": 4.532278165611459e-06, + "loss": 0.4531, + "step": 4159 + }, + { + "epoch": 2.884882108183079, + "grad_norm": 0.3784354701011851, + "learning_rate": 4.529868128733314e-06, + "loss": 0.447, + "step": 4160 + }, + { + "epoch": 2.8855755894590844, + "grad_norm": 0.38127303713728083, + "learning_rate": 4.527458202050674e-06, + "loss": 0.4612, + "step": 4161 + }, + { + "epoch": 2.88626907073509, + "grad_norm": 0.377705214411384, + "learning_rate": 4.525048386128409e-06, + "loss": 0.5111, + "step": 4162 + }, + { + "epoch": 2.886962552011096, + "grad_norm": 0.3841428778251947, + "learning_rate": 4.522638681531361e-06, + "loss": 0.4412, + "step": 4163 + }, + { + "epoch": 2.8876560332871013, + "grad_norm": 0.3954687219562789, + "learning_rate": 4.52022908882435e-06, + "loss": 0.442, + "step": 4164 + }, + { + "epoch": 2.8883495145631066, + "grad_norm": 0.35690896440635167, + "learning_rate": 4.5178196085721675e-06, + "loss": 0.4204, + "step": 4165 + }, + { + "epoch": 2.8890429958391124, + "grad_norm": 0.36102101537630227, + "learning_rate": 4.5154102413395766e-06, + "loss": 0.4556, + "step": 4166 + }, + { + "epoch": 2.889736477115118, + "grad_norm": 0.36307046359821055, + "learning_rate": 4.513000987691314e-06, + "loss": 0.4857, + "step": 4167 + }, + { + "epoch": 2.8904299583911235, + "grad_norm": 0.3551096280195344, + "learning_rate": 4.510591848192093e-06, + "loss": 0.4005, + "step": 4168 + }, + { + "epoch": 2.891123439667129, + "grad_norm": 0.3608771211593707, + "learning_rate": 4.508182823406599e-06, + "loss": 0.4605, + "step": 4169 + }, + { + "epoch": 2.8918169209431346, + "grad_norm": 0.3809405439867942, + "learning_rate": 4.50577391389949e-06, + "loss": 0.4829, + "step": 4170 + }, + { + "epoch": 2.8925104022191404, + "grad_norm": 0.3869152966360628, + "learning_rate": 4.503365120235395e-06, + "loss": 0.4496, + "step": 4171 + }, + { + "epoch": 2.8932038834951457, + "grad_norm": 0.36587027722390336, + "learning_rate": 4.500956442978918e-06, + "loss": 0.459, + "step": 4172 + }, + { + "epoch": 2.893897364771151, + "grad_norm": 0.33863471910286297, + "learning_rate": 4.498547882694637e-06, + "loss": 0.4278, + "step": 4173 + }, + { + "epoch": 2.8945908460471568, + "grad_norm": 0.35048346242787526, + "learning_rate": 4.496139439947103e-06, + "loss": 0.4588, + "step": 4174 + }, + { + "epoch": 2.8952843273231625, + "grad_norm": 0.3645785826197352, + "learning_rate": 4.493731115300832e-06, + "loss": 0.4172, + "step": 4175 + }, + { + "epoch": 2.895977808599168, + "grad_norm": 0.40875720726240106, + "learning_rate": 4.491322909320324e-06, + "loss": 0.4195, + "step": 4176 + }, + { + "epoch": 2.896671289875173, + "grad_norm": 0.31418901606969907, + "learning_rate": 4.4889148225700406e-06, + "loss": 0.3926, + "step": 4177 + }, + { + "epoch": 2.897364771151179, + "grad_norm": 0.3740156771524573, + "learning_rate": 4.486506855614422e-06, + "loss": 0.4057, + "step": 4178 + }, + { + "epoch": 2.8980582524271847, + "grad_norm": 0.38590233527498613, + "learning_rate": 4.484099009017876e-06, + "loss": 0.4252, + "step": 4179 + }, + { + "epoch": 2.89875173370319, + "grad_norm": 0.37572662377068333, + "learning_rate": 4.481691283344787e-06, + "loss": 0.4099, + "step": 4180 + }, + { + "epoch": 2.8994452149791954, + "grad_norm": 0.3816752535776974, + "learning_rate": 4.479283679159506e-06, + "loss": 0.4263, + "step": 4181 + }, + { + "epoch": 2.900138696255201, + "grad_norm": 0.359232261823802, + "learning_rate": 4.476876197026362e-06, + "loss": 0.4189, + "step": 4182 + }, + { + "epoch": 2.900832177531207, + "grad_norm": 0.4311357319828042, + "learning_rate": 4.4744688375096475e-06, + "loss": 0.4477, + "step": 4183 + }, + { + "epoch": 2.9015256588072122, + "grad_norm": 0.44353577407545236, + "learning_rate": 4.472061601173631e-06, + "loss": 0.4739, + "step": 4184 + }, + { + "epoch": 2.9022191400832176, + "grad_norm": 0.3840918030935441, + "learning_rate": 4.469654488582552e-06, + "loss": 0.41, + "step": 4185 + }, + { + "epoch": 2.9029126213592233, + "grad_norm": 0.3410392120300619, + "learning_rate": 4.467247500300621e-06, + "loss": 0.4033, + "step": 4186 + }, + { + "epoch": 2.903606102635229, + "grad_norm": 0.3885996076354032, + "learning_rate": 4.464840636892015e-06, + "loss": 0.4402, + "step": 4187 + }, + { + "epoch": 2.9042995839112344, + "grad_norm": 0.34003891337217107, + "learning_rate": 4.462433898920891e-06, + "loss": 0.4049, + "step": 4188 + }, + { + "epoch": 2.9049930651872398, + "grad_norm": 0.35214490516802605, + "learning_rate": 4.460027286951366e-06, + "loss": 0.395, + "step": 4189 + }, + { + "epoch": 2.9056865464632455, + "grad_norm": 0.3784738130498375, + "learning_rate": 4.457620801547533e-06, + "loss": 0.4299, + "step": 4190 + }, + { + "epoch": 2.9063800277392513, + "grad_norm": 0.37681793237234207, + "learning_rate": 4.455214443273458e-06, + "loss": 0.4373, + "step": 4191 + }, + { + "epoch": 2.9070735090152566, + "grad_norm": 0.3883313713399433, + "learning_rate": 4.452808212693171e-06, + "loss": 0.3772, + "step": 4192 + }, + { + "epoch": 2.907766990291262, + "grad_norm": 0.37135226167438945, + "learning_rate": 4.450402110370677e-06, + "loss": 0.4374, + "step": 4193 + }, + { + "epoch": 2.9084604715672677, + "grad_norm": 0.3732731984268749, + "learning_rate": 4.447996136869948e-06, + "loss": 0.4764, + "step": 4194 + }, + { + "epoch": 2.9091539528432735, + "grad_norm": 0.36666998974634846, + "learning_rate": 4.445590292754927e-06, + "loss": 0.3919, + "step": 4195 + }, + { + "epoch": 2.909847434119279, + "grad_norm": 0.34035184473469293, + "learning_rate": 4.443184578589525e-06, + "loss": 0.4259, + "step": 4196 + }, + { + "epoch": 2.910540915395284, + "grad_norm": 0.37071498147061005, + "learning_rate": 4.440778994937625e-06, + "loss": 0.4699, + "step": 4197 + }, + { + "epoch": 2.91123439667129, + "grad_norm": 0.3685621554169873, + "learning_rate": 4.4383735423630795e-06, + "loss": 0.4372, + "step": 4198 + }, + { + "epoch": 2.9119278779472957, + "grad_norm": 0.3493022308184043, + "learning_rate": 4.435968221429706e-06, + "loss": 0.4262, + "step": 4199 + }, + { + "epoch": 2.912621359223301, + "grad_norm": 0.40893099483569534, + "learning_rate": 4.433563032701298e-06, + "loss": 0.4014, + "step": 4200 + }, + { + "epoch": 2.9133148404993063, + "grad_norm": 0.3900126872343627, + "learning_rate": 4.431157976741614e-06, + "loss": 0.4277, + "step": 4201 + }, + { + "epoch": 2.914008321775312, + "grad_norm": 0.35138144824315687, + "learning_rate": 4.428753054114379e-06, + "loss": 0.4358, + "step": 4202 + }, + { + "epoch": 2.914701803051318, + "grad_norm": 0.39868077051943723, + "learning_rate": 4.426348265383292e-06, + "loss": 0.4106, + "step": 4203 + }, + { + "epoch": 2.915395284327323, + "grad_norm": 0.4172496149807763, + "learning_rate": 4.423943611112016e-06, + "loss": 0.4277, + "step": 4204 + }, + { + "epoch": 2.9160887656033285, + "grad_norm": 0.3761372428670614, + "learning_rate": 4.421539091864187e-06, + "loss": 0.4808, + "step": 4205 + }, + { + "epoch": 2.9167822468793343, + "grad_norm": 0.5150935787749773, + "learning_rate": 4.419134708203405e-06, + "loss": 0.4495, + "step": 4206 + }, + { + "epoch": 2.91747572815534, + "grad_norm": 0.3554940766434832, + "learning_rate": 4.416730460693239e-06, + "loss": 0.4398, + "step": 4207 + }, + { + "epoch": 2.9181692094313454, + "grad_norm": 0.40458991987118614, + "learning_rate": 4.41432634989723e-06, + "loss": 0.4376, + "step": 4208 + }, + { + "epoch": 2.9188626907073507, + "grad_norm": 0.952312026438392, + "learning_rate": 4.411922376378881e-06, + "loss": 0.4473, + "step": 4209 + }, + { + "epoch": 2.9195561719833565, + "grad_norm": 0.37818496012554476, + "learning_rate": 4.409518540701671e-06, + "loss": 0.4414, + "step": 4210 + }, + { + "epoch": 2.9202496532593623, + "grad_norm": 0.4574207706914251, + "learning_rate": 4.407114843429037e-06, + "loss": 0.4541, + "step": 4211 + }, + { + "epoch": 2.9209431345353676, + "grad_norm": 0.41131225230761653, + "learning_rate": 4.40471128512439e-06, + "loss": 0.4166, + "step": 4212 + }, + { + "epoch": 2.921636615811373, + "grad_norm": 0.39276294735712264, + "learning_rate": 4.402307866351107e-06, + "loss": 0.4372, + "step": 4213 + }, + { + "epoch": 2.9223300970873787, + "grad_norm": 0.5007212598795793, + "learning_rate": 4.399904587672531e-06, + "loss": 0.4033, + "step": 4214 + }, + { + "epoch": 2.9230235783633844, + "grad_norm": 0.3601425249249112, + "learning_rate": 4.397501449651974e-06, + "loss": 0.4436, + "step": 4215 + }, + { + "epoch": 2.9237170596393898, + "grad_norm": 0.353205294677814, + "learning_rate": 4.395098452852713e-06, + "loss": 0.3889, + "step": 4216 + }, + { + "epoch": 2.924410540915395, + "grad_norm": 0.3554608716705643, + "learning_rate": 4.392695597837993e-06, + "loss": 0.4176, + "step": 4217 + }, + { + "epoch": 2.925104022191401, + "grad_norm": 0.3571090668692505, + "learning_rate": 4.3902928851710274e-06, + "loss": 0.4235, + "step": 4218 + }, + { + "epoch": 2.9257975034674066, + "grad_norm": 0.3768309658853595, + "learning_rate": 4.387890315414994e-06, + "loss": 0.444, + "step": 4219 + }, + { + "epoch": 2.926490984743412, + "grad_norm": 0.39588873298644633, + "learning_rate": 4.385487889133039e-06, + "loss": 0.4637, + "step": 4220 + }, + { + "epoch": 2.9271844660194173, + "grad_norm": 0.4117060381307611, + "learning_rate": 4.38308560688827e-06, + "loss": 0.4356, + "step": 4221 + }, + { + "epoch": 2.927877947295423, + "grad_norm": 0.3898423934610993, + "learning_rate": 4.380683469243768e-06, + "loss": 0.4008, + "step": 4222 + }, + { + "epoch": 2.928571428571429, + "grad_norm": 0.3754459880185379, + "learning_rate": 4.3782814767625755e-06, + "loss": 0.4147, + "step": 4223 + }, + { + "epoch": 2.929264909847434, + "grad_norm": 0.3544460821057703, + "learning_rate": 4.375879630007701e-06, + "loss": 0.4809, + "step": 4224 + }, + { + "epoch": 2.9299583911234395, + "grad_norm": 0.3406276961905475, + "learning_rate": 4.373477929542123e-06, + "loss": 0.4396, + "step": 4225 + }, + { + "epoch": 2.9306518723994452, + "grad_norm": 0.3400972695368736, + "learning_rate": 4.3710763759287775e-06, + "loss": 0.3578, + "step": 4226 + }, + { + "epoch": 2.931345353675451, + "grad_norm": 0.3887518661034646, + "learning_rate": 4.368674969730578e-06, + "loss": 0.4811, + "step": 4227 + }, + { + "epoch": 2.9320388349514563, + "grad_norm": 0.3882612395308509, + "learning_rate": 4.3662737115103925e-06, + "loss": 0.412, + "step": 4228 + }, + { + "epoch": 2.9327323162274617, + "grad_norm": 0.37511246445358004, + "learning_rate": 4.363872601831059e-06, + "loss": 0.4366, + "step": 4229 + }, + { + "epoch": 2.9334257975034674, + "grad_norm": 0.391641421354005, + "learning_rate": 4.36147164125538e-06, + "loss": 0.3931, + "step": 4230 + }, + { + "epoch": 2.934119278779473, + "grad_norm": 0.4146628149959727, + "learning_rate": 4.359070830346126e-06, + "loss": 0.4177, + "step": 4231 + }, + { + "epoch": 2.9348127600554785, + "grad_norm": 0.44252220270216525, + "learning_rate": 4.356670169666025e-06, + "loss": 0.4813, + "step": 4232 + }, + { + "epoch": 2.935506241331484, + "grad_norm": 0.41331992251527333, + "learning_rate": 4.354269659777779e-06, + "loss": 0.4675, + "step": 4233 + }, + { + "epoch": 2.9361997226074896, + "grad_norm": 0.37396035072667244, + "learning_rate": 4.351869301244047e-06, + "loss": 0.4077, + "step": 4234 + }, + { + "epoch": 2.9368932038834954, + "grad_norm": 0.4240076191070561, + "learning_rate": 4.349469094627456e-06, + "loss": 0.3912, + "step": 4235 + }, + { + "epoch": 2.9375866851595007, + "grad_norm": 0.36096051600234114, + "learning_rate": 4.347069040490599e-06, + "loss": 0.4727, + "step": 4236 + }, + { + "epoch": 2.938280166435506, + "grad_norm": 0.36050935676652, + "learning_rate": 4.3446691393960295e-06, + "loss": 0.4088, + "step": 4237 + }, + { + "epoch": 2.938973647711512, + "grad_norm": 0.44378486837617315, + "learning_rate": 4.342269391906269e-06, + "loss": 0.4186, + "step": 4238 + }, + { + "epoch": 2.9396671289875176, + "grad_norm": 0.4080193062495759, + "learning_rate": 4.339869798583799e-06, + "loss": 0.4389, + "step": 4239 + }, + { + "epoch": 2.940360610263523, + "grad_norm": 0.36739074697199814, + "learning_rate": 4.337470359991068e-06, + "loss": 0.461, + "step": 4240 + }, + { + "epoch": 2.9410540915395282, + "grad_norm": 0.35488358826297134, + "learning_rate": 4.335071076690484e-06, + "loss": 0.4178, + "step": 4241 + }, + { + "epoch": 2.941747572815534, + "grad_norm": 0.3505797448971799, + "learning_rate": 4.332671949244426e-06, + "loss": 0.427, + "step": 4242 + }, + { + "epoch": 2.9424410540915398, + "grad_norm": 0.3995053966433075, + "learning_rate": 4.3302729782152276e-06, + "loss": 0.4599, + "step": 4243 + }, + { + "epoch": 2.943134535367545, + "grad_norm": 0.36577894271365613, + "learning_rate": 4.327874164165195e-06, + "loss": 0.4449, + "step": 4244 + }, + { + "epoch": 2.9438280166435504, + "grad_norm": 0.3558919168561584, + "learning_rate": 4.325475507656591e-06, + "loss": 0.4086, + "step": 4245 + }, + { + "epoch": 2.944521497919556, + "grad_norm": 0.413725186535279, + "learning_rate": 4.323077009251641e-06, + "loss": 0.4415, + "step": 4246 + }, + { + "epoch": 2.945214979195562, + "grad_norm": 0.3808476733000526, + "learning_rate": 4.320678669512539e-06, + "loss": 0.4474, + "step": 4247 + }, + { + "epoch": 2.9459084604715673, + "grad_norm": 0.36358032243985366, + "learning_rate": 4.318280489001437e-06, + "loss": 0.3982, + "step": 4248 + }, + { + "epoch": 2.9466019417475726, + "grad_norm": 0.34758205590939695, + "learning_rate": 4.31588246828045e-06, + "loss": 0.3799, + "step": 4249 + }, + { + "epoch": 2.9472954230235784, + "grad_norm": 0.36320108140650526, + "learning_rate": 4.313484607911659e-06, + "loss": 0.3944, + "step": 4250 + }, + { + "epoch": 2.947988904299584, + "grad_norm": 0.3685465916452843, + "learning_rate": 4.3110869084571035e-06, + "loss": 0.4189, + "step": 4251 + }, + { + "epoch": 2.9486823855755895, + "grad_norm": 0.38986392855565566, + "learning_rate": 4.3086893704787855e-06, + "loss": 0.3935, + "step": 4252 + }, + { + "epoch": 2.949375866851595, + "grad_norm": 0.3909559608756658, + "learning_rate": 4.306291994538674e-06, + "loss": 0.4492, + "step": 4253 + }, + { + "epoch": 2.9500693481276006, + "grad_norm": 0.3616514085155487, + "learning_rate": 4.3038947811986945e-06, + "loss": 0.3869, + "step": 4254 + }, + { + "epoch": 2.9507628294036063, + "grad_norm": 0.35377823474419373, + "learning_rate": 4.3014977310207385e-06, + "loss": 0.4333, + "step": 4255 + }, + { + "epoch": 2.9514563106796117, + "grad_norm": 0.36875846519045613, + "learning_rate": 4.299100844566654e-06, + "loss": 0.4088, + "step": 4256 + }, + { + "epoch": 2.952149791955617, + "grad_norm": 0.3822074991593694, + "learning_rate": 4.296704122398256e-06, + "loss": 0.4219, + "step": 4257 + }, + { + "epoch": 2.9528432732316228, + "grad_norm": 0.3558894690283381, + "learning_rate": 4.294307565077318e-06, + "loss": 0.43, + "step": 4258 + }, + { + "epoch": 2.9535367545076285, + "grad_norm": 0.3758378590592379, + "learning_rate": 4.2919111731655764e-06, + "loss": 0.4515, + "step": 4259 + }, + { + "epoch": 2.954230235783634, + "grad_norm": 0.3651968594830969, + "learning_rate": 4.2895149472247275e-06, + "loss": 0.4407, + "step": 4260 + }, + { + "epoch": 2.954923717059639, + "grad_norm": 0.37824989311907037, + "learning_rate": 4.2871188878164275e-06, + "loss": 0.408, + "step": 4261 + }, + { + "epoch": 2.955617198335645, + "grad_norm": 0.3616979418948012, + "learning_rate": 4.284722995502298e-06, + "loss": 0.3892, + "step": 4262 + }, + { + "epoch": 2.9563106796116507, + "grad_norm": 0.37057440961879723, + "learning_rate": 4.282327270843919e-06, + "loss": 0.3993, + "step": 4263 + }, + { + "epoch": 2.957004160887656, + "grad_norm": 0.393727943353917, + "learning_rate": 4.27993171440283e-06, + "loss": 0.3983, + "step": 4264 + }, + { + "epoch": 2.9576976421636614, + "grad_norm": 0.33098126965279817, + "learning_rate": 4.277536326740532e-06, + "loss": 0.4278, + "step": 4265 + }, + { + "epoch": 2.958391123439667, + "grad_norm": 0.35456096910736296, + "learning_rate": 4.275141108418487e-06, + "loss": 0.406, + "step": 4266 + }, + { + "epoch": 2.959084604715673, + "grad_norm": 0.3857375998666261, + "learning_rate": 4.272746059998117e-06, + "loss": 0.4548, + "step": 4267 + }, + { + "epoch": 2.9597780859916782, + "grad_norm": 0.3443289102946537, + "learning_rate": 4.270351182040802e-06, + "loss": 0.414, + "step": 4268 + }, + { + "epoch": 2.9604715672676836, + "grad_norm": 0.402873190215866, + "learning_rate": 4.267956475107886e-06, + "loss": 0.4114, + "step": 4269 + }, + { + "epoch": 2.9611650485436893, + "grad_norm": 0.48112136404907163, + "learning_rate": 4.265561939760671e-06, + "loss": 0.4445, + "step": 4270 + }, + { + "epoch": 2.961858529819695, + "grad_norm": 0.5064996632471026, + "learning_rate": 4.263167576560417e-06, + "loss": 0.4009, + "step": 4271 + }, + { + "epoch": 2.9625520110957004, + "grad_norm": 0.368407837281134, + "learning_rate": 4.2607733860683485e-06, + "loss": 0.4447, + "step": 4272 + }, + { + "epoch": 2.9632454923717058, + "grad_norm": 0.3543295548768013, + "learning_rate": 4.258379368845644e-06, + "loss": 0.4133, + "step": 4273 + }, + { + "epoch": 2.9639389736477115, + "grad_norm": 0.3528573723218785, + "learning_rate": 4.255985525453443e-06, + "loss": 0.394, + "step": 4274 + }, + { + "epoch": 2.9646324549237173, + "grad_norm": 0.389851665848564, + "learning_rate": 4.253591856452849e-06, + "loss": 0.4113, + "step": 4275 + }, + { + "epoch": 2.9653259361997226, + "grad_norm": 0.49549636672036323, + "learning_rate": 4.251198362404917e-06, + "loss": 0.4479, + "step": 4276 + }, + { + "epoch": 2.966019417475728, + "grad_norm": 0.508548139600559, + "learning_rate": 4.248805043870665e-06, + "loss": 0.5098, + "step": 4277 + }, + { + "epoch": 2.9667128987517337, + "grad_norm": 0.42703095776247146, + "learning_rate": 4.246411901411071e-06, + "loss": 0.4378, + "step": 4278 + }, + { + "epoch": 2.9674063800277395, + "grad_norm": 0.4660796201150934, + "learning_rate": 4.244018935587068e-06, + "loss": 0.4465, + "step": 4279 + }, + { + "epoch": 2.968099861303745, + "grad_norm": 0.3667537562587972, + "learning_rate": 4.241626146959553e-06, + "loss": 0.4175, + "step": 4280 + }, + { + "epoch": 2.96879334257975, + "grad_norm": 0.41178798121954824, + "learning_rate": 4.239233536089377e-06, + "loss": 0.4339, + "step": 4281 + }, + { + "epoch": 2.969486823855756, + "grad_norm": 0.41827651139444094, + "learning_rate": 4.236841103537349e-06, + "loss": 0.4153, + "step": 4282 + }, + { + "epoch": 2.9701803051317617, + "grad_norm": 0.3524789326648256, + "learning_rate": 4.234448849864241e-06, + "loss": 0.4012, + "step": 4283 + }, + { + "epoch": 2.970873786407767, + "grad_norm": 0.3816231220806304, + "learning_rate": 4.232056775630778e-06, + "loss": 0.4134, + "step": 4284 + }, + { + "epoch": 2.9715672676837723, + "grad_norm": 0.36447400401810076, + "learning_rate": 4.229664881397645e-06, + "loss": 0.4423, + "step": 4285 + }, + { + "epoch": 2.972260748959778, + "grad_norm": 0.3225440930529132, + "learning_rate": 4.227273167725484e-06, + "loss": 0.4161, + "step": 4286 + }, + { + "epoch": 2.972954230235784, + "grad_norm": 0.34130846687034655, + "learning_rate": 4.224881635174897e-06, + "loss": 0.4376, + "step": 4287 + }, + { + "epoch": 2.973647711511789, + "grad_norm": 0.34847053691640917, + "learning_rate": 4.2224902843064384e-06, + "loss": 0.3965, + "step": 4288 + }, + { + "epoch": 2.9743411927877945, + "grad_norm": 0.35125933384185115, + "learning_rate": 4.220099115680628e-06, + "loss": 0.4568, + "step": 4289 + }, + { + "epoch": 2.9750346740638003, + "grad_norm": 0.3585432803771428, + "learning_rate": 4.217708129857937e-06, + "loss": 0.3907, + "step": 4290 + }, + { + "epoch": 2.975728155339806, + "grad_norm": 0.36571877662928043, + "learning_rate": 4.215317327398795e-06, + "loss": 0.4425, + "step": 4291 + }, + { + "epoch": 2.9764216366158114, + "grad_norm": 0.7306906625225886, + "learning_rate": 4.212926708863588e-06, + "loss": 0.4324, + "step": 4292 + }, + { + "epoch": 2.9771151178918167, + "grad_norm": 0.36069675711693266, + "learning_rate": 4.210536274812661e-06, + "loss": 0.3597, + "step": 4293 + }, + { + "epoch": 2.9778085991678225, + "grad_norm": 0.43992143936926636, + "learning_rate": 4.208146025806313e-06, + "loss": 0.4402, + "step": 4294 + }, + { + "epoch": 2.9785020804438282, + "grad_norm": 0.3551426560349192, + "learning_rate": 4.205755962404801e-06, + "loss": 0.3753, + "step": 4295 + }, + { + "epoch": 2.9791955617198336, + "grad_norm": 0.33705470699106477, + "learning_rate": 4.20336608516834e-06, + "loss": 0.3778, + "step": 4296 + }, + { + "epoch": 2.979889042995839, + "grad_norm": 0.3875134793699313, + "learning_rate": 4.200976394657098e-06, + "loss": 0.4325, + "step": 4297 + }, + { + "epoch": 2.9805825242718447, + "grad_norm": 0.3615281724838666, + "learning_rate": 4.198586891431203e-06, + "loss": 0.432, + "step": 4298 + }, + { + "epoch": 2.9812760055478504, + "grad_norm": 0.41095565863060013, + "learning_rate": 4.196197576050737e-06, + "loss": 0.4523, + "step": 4299 + }, + { + "epoch": 2.9819694868238558, + "grad_norm": 0.4682817793802189, + "learning_rate": 4.1938084490757375e-06, + "loss": 0.4717, + "step": 4300 + }, + { + "epoch": 2.982662968099861, + "grad_norm": 0.3563769249355266, + "learning_rate": 4.191419511066199e-06, + "loss": 0.4351, + "step": 4301 + }, + { + "epoch": 2.983356449375867, + "grad_norm": 0.4302142439490134, + "learning_rate": 4.1890307625820705e-06, + "loss": 0.4829, + "step": 4302 + }, + { + "epoch": 2.9840499306518726, + "grad_norm": 0.37469363036699777, + "learning_rate": 4.186642204183259e-06, + "loss": 0.4627, + "step": 4303 + }, + { + "epoch": 2.984743411927878, + "grad_norm": 0.4031889250698094, + "learning_rate": 4.184253836429624e-06, + "loss": 0.4348, + "step": 4304 + }, + { + "epoch": 2.9854368932038833, + "grad_norm": 0.35424813351448564, + "learning_rate": 4.181865659880982e-06, + "loss": 0.4111, + "step": 4305 + }, + { + "epoch": 2.986130374479889, + "grad_norm": 0.34422476739029473, + "learning_rate": 4.179477675097102e-06, + "loss": 0.3848, + "step": 4306 + }, + { + "epoch": 2.986823855755895, + "grad_norm": 0.39324852193346244, + "learning_rate": 4.177089882637713e-06, + "loss": 0.454, + "step": 4307 + }, + { + "epoch": 2.9875173370319, + "grad_norm": 0.36175083718314827, + "learning_rate": 4.174702283062497e-06, + "loss": 0.4316, + "step": 4308 + }, + { + "epoch": 2.9882108183079055, + "grad_norm": 0.3511453289481942, + "learning_rate": 4.172314876931089e-06, + "loss": 0.3857, + "step": 4309 + }, + { + "epoch": 2.9889042995839112, + "grad_norm": 0.3534191191212724, + "learning_rate": 4.1699276648030805e-06, + "loss": 0.4012, + "step": 4310 + }, + { + "epoch": 2.989597780859917, + "grad_norm": 0.38250423048380033, + "learning_rate": 4.167540647238013e-06, + "loss": 0.4212, + "step": 4311 + }, + { + "epoch": 2.9902912621359223, + "grad_norm": 0.35440520217564475, + "learning_rate": 4.165153824795391e-06, + "loss": 0.4413, + "step": 4312 + }, + { + "epoch": 2.9909847434119277, + "grad_norm": 0.35663598743105585, + "learning_rate": 4.162767198034665e-06, + "loss": 0.4371, + "step": 4313 + }, + { + "epoch": 2.9916782246879334, + "grad_norm": 0.38788743180077906, + "learning_rate": 4.1603807675152444e-06, + "loss": 0.4355, + "step": 4314 + }, + { + "epoch": 2.992371705963939, + "grad_norm": 0.4110449964082554, + "learning_rate": 4.15799453379649e-06, + "loss": 0.4587, + "step": 4315 + }, + { + "epoch": 2.9930651872399445, + "grad_norm": 0.3820571059332362, + "learning_rate": 4.15560849743772e-06, + "loss": 0.4319, + "step": 4316 + }, + { + "epoch": 2.99375866851595, + "grad_norm": 0.35652755584557944, + "learning_rate": 4.153222658998203e-06, + "loss": 0.4386, + "step": 4317 + }, + { + "epoch": 2.9944521497919556, + "grad_norm": 0.38678005121238507, + "learning_rate": 4.1508370190371626e-06, + "loss": 0.3998, + "step": 4318 + }, + { + "epoch": 2.9951456310679614, + "grad_norm": 0.38066917572403497, + "learning_rate": 4.148451578113773e-06, + "loss": 0.4128, + "step": 4319 + }, + { + "epoch": 2.9958391123439667, + "grad_norm": 0.38007142513719117, + "learning_rate": 4.146066336787169e-06, + "loss": 0.4394, + "step": 4320 + }, + { + "epoch": 2.996532593619972, + "grad_norm": 0.4126330063215766, + "learning_rate": 4.143681295616429e-06, + "loss": 0.4262, + "step": 4321 + }, + { + "epoch": 2.997226074895978, + "grad_norm": 0.38827172282440003, + "learning_rate": 4.141296455160592e-06, + "loss": 0.4048, + "step": 4322 + }, + { + "epoch": 2.9979195561719836, + "grad_norm": 0.38603404511784284, + "learning_rate": 4.138911815978648e-06, + "loss": 0.4587, + "step": 4323 + }, + { + "epoch": 2.998613037447989, + "grad_norm": 0.4406539070299765, + "learning_rate": 4.136527378629535e-06, + "loss": 0.4192, + "step": 4324 + }, + { + "epoch": 2.9993065187239942, + "grad_norm": 0.391010000823013, + "learning_rate": 4.134143143672154e-06, + "loss": 0.4233, + "step": 4325 + }, + { + "epoch": 3.0, + "grad_norm": 0.4101991000166011, + "learning_rate": 4.131759111665349e-06, + "loss": 0.4465, + "step": 4326 + }, + { + "epoch": 3.0006934812760058, + "grad_norm": 0.3719559775138811, + "learning_rate": 4.129375283167919e-06, + "loss": 0.3685, + "step": 4327 + }, + { + "epoch": 3.001386962552011, + "grad_norm": 0.3654641212889352, + "learning_rate": 4.126991658738618e-06, + "loss": 0.3695, + "step": 4328 + }, + { + "epoch": 3.002080443828017, + "grad_norm": 0.3789960837522673, + "learning_rate": 4.12460823893615e-06, + "loss": 0.4001, + "step": 4329 + }, + { + "epoch": 3.002773925104022, + "grad_norm": 0.3691564243155371, + "learning_rate": 4.122225024319171e-06, + "loss": 0.3851, + "step": 4330 + }, + { + "epoch": 3.003467406380028, + "grad_norm": 0.4072313138856476, + "learning_rate": 4.119842015446288e-06, + "loss": 0.4315, + "step": 4331 + }, + { + "epoch": 3.0041608876560333, + "grad_norm": 0.347853917127619, + "learning_rate": 4.117459212876062e-06, + "loss": 0.3919, + "step": 4332 + }, + { + "epoch": 3.004854368932039, + "grad_norm": 0.38506692871083675, + "learning_rate": 4.115076617167004e-06, + "loss": 0.3465, + "step": 4333 + }, + { + "epoch": 3.0055478502080444, + "grad_norm": 0.3748928420469291, + "learning_rate": 4.11269422887758e-06, + "loss": 0.4176, + "step": 4334 + }, + { + "epoch": 3.00624133148405, + "grad_norm": 0.3675701455427544, + "learning_rate": 4.110312048566203e-06, + "loss": 0.3776, + "step": 4335 + }, + { + "epoch": 3.0069348127600555, + "grad_norm": 0.3881110998526009, + "learning_rate": 4.107930076791237e-06, + "loss": 0.3807, + "step": 4336 + }, + { + "epoch": 3.0076282940360612, + "grad_norm": 0.43616785225233196, + "learning_rate": 4.105548314111001e-06, + "loss": 0.384, + "step": 4337 + }, + { + "epoch": 3.0083217753120666, + "grad_norm": 0.40718749495585393, + "learning_rate": 4.103166761083762e-06, + "loss": 0.3954, + "step": 4338 + }, + { + "epoch": 3.0090152565880723, + "grad_norm": 0.4001664690303196, + "learning_rate": 4.100785418267739e-06, + "loss": 0.3849, + "step": 4339 + }, + { + "epoch": 3.0097087378640777, + "grad_norm": 0.39387857196310244, + "learning_rate": 4.098404286221102e-06, + "loss": 0.3454, + "step": 4340 + }, + { + "epoch": 3.0104022191400834, + "grad_norm": 0.3825724846315654, + "learning_rate": 4.0960233655019706e-06, + "loss": 0.3866, + "step": 4341 + }, + { + "epoch": 3.0110957004160888, + "grad_norm": 0.44220586162762765, + "learning_rate": 4.093642656668414e-06, + "loss": 0.3636, + "step": 4342 + }, + { + "epoch": 3.0117891816920945, + "grad_norm": 0.3632444781743847, + "learning_rate": 4.091262160278455e-06, + "loss": 0.3515, + "step": 4343 + }, + { + "epoch": 3.0124826629681, + "grad_norm": 0.34085402395558856, + "learning_rate": 4.088881876890065e-06, + "loss": 0.3301, + "step": 4344 + }, + { + "epoch": 3.0131761442441056, + "grad_norm": 0.36949316661043635, + "learning_rate": 4.086501807061164e-06, + "loss": 0.3559, + "step": 4345 + }, + { + "epoch": 3.013869625520111, + "grad_norm": 0.3529786631931451, + "learning_rate": 4.084121951349625e-06, + "loss": 0.3885, + "step": 4346 + }, + { + "epoch": 3.0145631067961167, + "grad_norm": 0.3770742460761115, + "learning_rate": 4.081742310313266e-06, + "loss": 0.3687, + "step": 4347 + }, + { + "epoch": 3.015256588072122, + "grad_norm": 0.37380731725930166, + "learning_rate": 4.0793628845098595e-06, + "loss": 0.3798, + "step": 4348 + }, + { + "epoch": 3.015950069348128, + "grad_norm": 0.38757118955990333, + "learning_rate": 4.076983674497125e-06, + "loss": 0.3713, + "step": 4349 + }, + { + "epoch": 3.016643550624133, + "grad_norm": 0.5098215009078929, + "learning_rate": 4.074604680832733e-06, + "loss": 0.3833, + "step": 4350 + }, + { + "epoch": 3.017337031900139, + "grad_norm": 0.36314231069913294, + "learning_rate": 4.0722259040743e-06, + "loss": 0.3756, + "step": 4351 + }, + { + "epoch": 3.0180305131761442, + "grad_norm": 0.3538191891435209, + "learning_rate": 4.069847344779397e-06, + "loss": 0.3172, + "step": 4352 + }, + { + "epoch": 3.01872399445215, + "grad_norm": 0.5568284681540532, + "learning_rate": 4.0674690035055405e-06, + "loss": 0.4167, + "step": 4353 + }, + { + "epoch": 3.0194174757281553, + "grad_norm": 0.42335869469240806, + "learning_rate": 4.0650908808101965e-06, + "loss": 0.3746, + "step": 4354 + }, + { + "epoch": 3.020110957004161, + "grad_norm": 0.37612096423362434, + "learning_rate": 4.0627129772507785e-06, + "loss": 0.3401, + "step": 4355 + }, + { + "epoch": 3.0208044382801664, + "grad_norm": 0.36977584141068426, + "learning_rate": 4.0603352933846494e-06, + "loss": 0.342, + "step": 4356 + }, + { + "epoch": 3.021497919556172, + "grad_norm": 0.3932491615188737, + "learning_rate": 4.057957829769123e-06, + "loss": 0.3772, + "step": 4357 + }, + { + "epoch": 3.0221914008321775, + "grad_norm": 0.4121961084134789, + "learning_rate": 4.05558058696146e-06, + "loss": 0.393, + "step": 4358 + }, + { + "epoch": 3.0228848821081833, + "grad_norm": 0.48445694047534166, + "learning_rate": 4.053203565518865e-06, + "loss": 0.4083, + "step": 4359 + }, + { + "epoch": 3.0235783633841886, + "grad_norm": 0.3656478121904701, + "learning_rate": 4.0508267659984975e-06, + "loss": 0.3668, + "step": 4360 + }, + { + "epoch": 3.0242718446601944, + "grad_norm": 0.40094960423949705, + "learning_rate": 4.048450188957462e-06, + "loss": 0.3851, + "step": 4361 + }, + { + "epoch": 3.0249653259361997, + "grad_norm": 0.40240583829137166, + "learning_rate": 4.046073834952812e-06, + "loss": 0.3697, + "step": 4362 + }, + { + "epoch": 3.0256588072122055, + "grad_norm": 0.43912272555439524, + "learning_rate": 4.043697704541546e-06, + "loss": 0.4239, + "step": 4363 + }, + { + "epoch": 3.026352288488211, + "grad_norm": 0.3720974632416348, + "learning_rate": 4.041321798280612e-06, + "loss": 0.3764, + "step": 4364 + }, + { + "epoch": 3.0270457697642166, + "grad_norm": 0.45346617219474145, + "learning_rate": 4.038946116726906e-06, + "loss": 0.3576, + "step": 4365 + }, + { + "epoch": 3.027739251040222, + "grad_norm": 0.34260736533784614, + "learning_rate": 4.03657066043727e-06, + "loss": 0.367, + "step": 4366 + }, + { + "epoch": 3.0284327323162277, + "grad_norm": 0.39723099833615794, + "learning_rate": 4.034195429968494e-06, + "loss": 0.4182, + "step": 4367 + }, + { + "epoch": 3.029126213592233, + "grad_norm": 0.3666378531436607, + "learning_rate": 4.031820425877313e-06, + "loss": 0.3314, + "step": 4368 + }, + { + "epoch": 3.0298196948682388, + "grad_norm": 0.3558768237219755, + "learning_rate": 4.029445648720411e-06, + "loss": 0.3576, + "step": 4369 + }, + { + "epoch": 3.030513176144244, + "grad_norm": 0.40215255007578005, + "learning_rate": 4.027071099054423e-06, + "loss": 0.3855, + "step": 4370 + }, + { + "epoch": 3.03120665742025, + "grad_norm": 0.45793754512393064, + "learning_rate": 4.024696777435922e-06, + "loss": 0.4424, + "step": 4371 + }, + { + "epoch": 3.031900138696255, + "grad_norm": 0.37122390252052, + "learning_rate": 4.022322684421432e-06, + "loss": 0.369, + "step": 4372 + }, + { + "epoch": 3.032593619972261, + "grad_norm": 0.3784314387255944, + "learning_rate": 4.0199488205674256e-06, + "loss": 0.377, + "step": 4373 + }, + { + "epoch": 3.0332871012482663, + "grad_norm": 0.3902191024416504, + "learning_rate": 4.017575186430318e-06, + "loss": 0.3903, + "step": 4374 + }, + { + "epoch": 3.033980582524272, + "grad_norm": 0.5406084682555689, + "learning_rate": 4.015201782566471e-06, + "loss": 0.3748, + "step": 4375 + }, + { + "epoch": 3.0346740638002774, + "grad_norm": 0.399094481947949, + "learning_rate": 4.012828609532193e-06, + "loss": 0.3856, + "step": 4376 + }, + { + "epoch": 3.035367545076283, + "grad_norm": 0.4238528346551883, + "learning_rate": 4.010455667883741e-06, + "loss": 0.3824, + "step": 4377 + }, + { + "epoch": 3.0360610263522885, + "grad_norm": 0.8500650484267042, + "learning_rate": 4.008082958177311e-06, + "loss": 0.3648, + "step": 4378 + }, + { + "epoch": 3.0367545076282942, + "grad_norm": 0.39045213922371413, + "learning_rate": 4.005710480969055e-06, + "loss": 0.3606, + "step": 4379 + }, + { + "epoch": 3.0374479889042996, + "grad_norm": 0.4233018574190896, + "learning_rate": 4.0033382368150605e-06, + "loss": 0.4164, + "step": 4380 + }, + { + "epoch": 3.0381414701803053, + "grad_norm": 0.4066581532418864, + "learning_rate": 4.0009662262713635e-06, + "loss": 0.3974, + "step": 4381 + }, + { + "epoch": 3.0388349514563107, + "grad_norm": 0.46212936930378745, + "learning_rate": 3.99859444989395e-06, + "loss": 0.4105, + "step": 4382 + }, + { + "epoch": 3.0395284327323164, + "grad_norm": 0.4187223555933156, + "learning_rate": 3.996222908238744e-06, + "loss": 0.3873, + "step": 4383 + }, + { + "epoch": 3.0402219140083218, + "grad_norm": 0.3788092232501361, + "learning_rate": 3.993851601861618e-06, + "loss": 0.3761, + "step": 4384 + }, + { + "epoch": 3.0409153952843275, + "grad_norm": 0.3945015512020017, + "learning_rate": 3.991480531318391e-06, + "loss": 0.4047, + "step": 4385 + }, + { + "epoch": 3.041608876560333, + "grad_norm": 0.3762729893257261, + "learning_rate": 3.989109697164823e-06, + "loss": 0.3458, + "step": 4386 + }, + { + "epoch": 3.0423023578363386, + "grad_norm": 0.3798084301011424, + "learning_rate": 3.986739099956619e-06, + "loss": 0.3858, + "step": 4387 + }, + { + "epoch": 3.042995839112344, + "grad_norm": 0.3815554014462042, + "learning_rate": 3.984368740249433e-06, + "loss": 0.3798, + "step": 4388 + }, + { + "epoch": 3.0436893203883497, + "grad_norm": 0.4605481225074819, + "learning_rate": 3.981998618598858e-06, + "loss": 0.4124, + "step": 4389 + }, + { + "epoch": 3.044382801664355, + "grad_norm": 0.36333817698223786, + "learning_rate": 3.979628735560436e-06, + "loss": 0.3574, + "step": 4390 + }, + { + "epoch": 3.045076282940361, + "grad_norm": 0.3967177621347652, + "learning_rate": 3.9772590916896466e-06, + "loss": 0.3639, + "step": 4391 + }, + { + "epoch": 3.045769764216366, + "grad_norm": 0.3705992323429345, + "learning_rate": 3.974889687541921e-06, + "loss": 0.3781, + "step": 4392 + }, + { + "epoch": 3.046463245492372, + "grad_norm": 0.3519964447329601, + "learning_rate": 3.972520523672627e-06, + "loss": 0.3878, + "step": 4393 + }, + { + "epoch": 3.0471567267683772, + "grad_norm": 0.41492051483474657, + "learning_rate": 3.970151600637081e-06, + "loss": 0.4385, + "step": 4394 + }, + { + "epoch": 3.047850208044383, + "grad_norm": 0.39278228628621326, + "learning_rate": 3.967782918990542e-06, + "loss": 0.4078, + "step": 4395 + }, + { + "epoch": 3.0485436893203883, + "grad_norm": 0.40295121425496927, + "learning_rate": 3.965414479288209e-06, + "loss": 0.3569, + "step": 4396 + }, + { + "epoch": 3.049237170596394, + "grad_norm": 0.37943102657769573, + "learning_rate": 3.96304628208523e-06, + "loss": 0.381, + "step": 4397 + }, + { + "epoch": 3.0499306518723994, + "grad_norm": 0.3449115428865669, + "learning_rate": 3.960678327936693e-06, + "loss": 0.3368, + "step": 4398 + }, + { + "epoch": 3.050624133148405, + "grad_norm": 0.38458165780369025, + "learning_rate": 3.95831061739763e-06, + "loss": 0.3573, + "step": 4399 + }, + { + "epoch": 3.0513176144244105, + "grad_norm": 0.42200252453603304, + "learning_rate": 3.955943151023014e-06, + "loss": 0.3866, + "step": 4400 + }, + { + "epoch": 3.0520110957004163, + "grad_norm": 0.3748800788726624, + "learning_rate": 3.95357592936776e-06, + "loss": 0.3662, + "step": 4401 + }, + { + "epoch": 3.0527045769764216, + "grad_norm": 0.4440741913042754, + "learning_rate": 3.951208952986731e-06, + "loss": 0.3889, + "step": 4402 + }, + { + "epoch": 3.0533980582524274, + "grad_norm": 0.7173710109753139, + "learning_rate": 3.948842222434728e-06, + "loss": 0.3632, + "step": 4403 + }, + { + "epoch": 3.0540915395284327, + "grad_norm": 0.40818919279684757, + "learning_rate": 3.9464757382664945e-06, + "loss": 0.4191, + "step": 4404 + }, + { + "epoch": 3.0547850208044385, + "grad_norm": 0.3904329790032371, + "learning_rate": 3.944109501036717e-06, + "loss": 0.407, + "step": 4405 + }, + { + "epoch": 3.055478502080444, + "grad_norm": 0.40606757353672723, + "learning_rate": 3.941743511300026e-06, + "loss": 0.359, + "step": 4406 + }, + { + "epoch": 3.0561719833564496, + "grad_norm": 0.6589194010129817, + "learning_rate": 3.939377769610993e-06, + "loss": 0.4049, + "step": 4407 + }, + { + "epoch": 3.056865464632455, + "grad_norm": 0.3872819681503515, + "learning_rate": 3.9370122765241285e-06, + "loss": 0.3978, + "step": 4408 + }, + { + "epoch": 3.0575589459084607, + "grad_norm": 0.37702985940007444, + "learning_rate": 3.934647032593888e-06, + "loss": 0.3765, + "step": 4409 + }, + { + "epoch": 3.058252427184466, + "grad_norm": 0.3687171886954747, + "learning_rate": 3.932282038374667e-06, + "loss": 0.3437, + "step": 4410 + }, + { + "epoch": 3.0589459084604718, + "grad_norm": 0.38087865965786877, + "learning_rate": 3.929917294420804e-06, + "loss": 0.3966, + "step": 4411 + }, + { + "epoch": 3.059639389736477, + "grad_norm": 0.4568339837713104, + "learning_rate": 3.927552801286578e-06, + "loss": 0.3811, + "step": 4412 + }, + { + "epoch": 3.060332871012483, + "grad_norm": 0.4977526444265689, + "learning_rate": 3.925188559526207e-06, + "loss": 0.3698, + "step": 4413 + }, + { + "epoch": 3.061026352288488, + "grad_norm": 0.403868775544694, + "learning_rate": 3.922824569693852e-06, + "loss": 0.3826, + "step": 4414 + }, + { + "epoch": 3.061719833564494, + "grad_norm": 0.3637143719338198, + "learning_rate": 3.920460832343619e-06, + "loss": 0.3403, + "step": 4415 + }, + { + "epoch": 3.0624133148404993, + "grad_norm": 0.38143344933604256, + "learning_rate": 3.918097348029548e-06, + "loss": 0.3652, + "step": 4416 + }, + { + "epoch": 3.063106796116505, + "grad_norm": 0.38097375495634567, + "learning_rate": 3.915734117305624e-06, + "loss": 0.425, + "step": 4417 + }, + { + "epoch": 3.0638002773925104, + "grad_norm": 0.3758435975327286, + "learning_rate": 3.913371140725769e-06, + "loss": 0.402, + "step": 4418 + }, + { + "epoch": 3.064493758668516, + "grad_norm": 0.43236393960817493, + "learning_rate": 3.911008418843849e-06, + "loss": 0.4165, + "step": 4419 + }, + { + "epoch": 3.0651872399445215, + "grad_norm": 0.46155058807197424, + "learning_rate": 3.90864595221367e-06, + "loss": 0.3354, + "step": 4420 + }, + { + "epoch": 3.0658807212205272, + "grad_norm": 0.3912992436195119, + "learning_rate": 3.906283741388974e-06, + "loss": 0.361, + "step": 4421 + }, + { + "epoch": 3.0665742024965326, + "grad_norm": 0.33556236693293084, + "learning_rate": 3.903921786923447e-06, + "loss": 0.3226, + "step": 4422 + }, + { + "epoch": 3.0672676837725383, + "grad_norm": 0.374569138102664, + "learning_rate": 3.901560089370717e-06, + "loss": 0.3496, + "step": 4423 + }, + { + "epoch": 3.0679611650485437, + "grad_norm": 0.38838078368995566, + "learning_rate": 3.899198649284348e-06, + "loss": 0.379, + "step": 4424 + }, + { + "epoch": 3.0686546463245494, + "grad_norm": 0.44914310878085484, + "learning_rate": 3.896837467217842e-06, + "loss": 0.3542, + "step": 4425 + }, + { + "epoch": 3.0693481276005548, + "grad_norm": 0.3807961890905395, + "learning_rate": 3.894476543724643e-06, + "loss": 0.3576, + "step": 4426 + }, + { + "epoch": 3.0700416088765605, + "grad_norm": 0.372374503023027, + "learning_rate": 3.8921158793581375e-06, + "loss": 0.3156, + "step": 4427 + }, + { + "epoch": 3.070735090152566, + "grad_norm": 0.40164708871920485, + "learning_rate": 3.889755474671645e-06, + "loss": 0.3544, + "step": 4428 + }, + { + "epoch": 3.0714285714285716, + "grad_norm": 0.38329865374235583, + "learning_rate": 3.887395330218429e-06, + "loss": 0.3704, + "step": 4429 + }, + { + "epoch": 3.072122052704577, + "grad_norm": 0.3604012596897037, + "learning_rate": 3.88503544655169e-06, + "loss": 0.3635, + "step": 4430 + }, + { + "epoch": 3.0728155339805827, + "grad_norm": 0.36037728141675845, + "learning_rate": 3.882675824224565e-06, + "loss": 0.3449, + "step": 4431 + }, + { + "epoch": 3.073509015256588, + "grad_norm": 0.5244135197845126, + "learning_rate": 3.880316463790137e-06, + "loss": 0.3926, + "step": 4432 + }, + { + "epoch": 3.074202496532594, + "grad_norm": 0.4424754702166302, + "learning_rate": 3.8779573658014204e-06, + "loss": 0.4037, + "step": 4433 + }, + { + "epoch": 3.074895977808599, + "grad_norm": 0.4151757387406011, + "learning_rate": 3.8755985308113705e-06, + "loss": 0.3547, + "step": 4434 + }, + { + "epoch": 3.075589459084605, + "grad_norm": 0.4122879892622112, + "learning_rate": 3.873239959372883e-06, + "loss": 0.3903, + "step": 4435 + }, + { + "epoch": 3.0762829403606102, + "grad_norm": 1.2072034955373634, + "learning_rate": 3.870881652038788e-06, + "loss": 0.362, + "step": 4436 + }, + { + "epoch": 3.076976421636616, + "grad_norm": 0.4145341816521233, + "learning_rate": 3.8685236093618574e-06, + "loss": 0.3455, + "step": 4437 + }, + { + "epoch": 3.0776699029126213, + "grad_norm": 0.4269189459427349, + "learning_rate": 3.866165831894796e-06, + "loss": 0.4168, + "step": 4438 + }, + { + "epoch": 3.078363384188627, + "grad_norm": 0.39122307033715725, + "learning_rate": 3.863808320190254e-06, + "loss": 0.3492, + "step": 4439 + }, + { + "epoch": 3.0790568654646324, + "grad_norm": 0.37944658350183813, + "learning_rate": 3.861451074800809e-06, + "loss": 0.3621, + "step": 4440 + }, + { + "epoch": 3.079750346740638, + "grad_norm": 0.4260255566730035, + "learning_rate": 3.85909409627899e-06, + "loss": 0.402, + "step": 4441 + }, + { + "epoch": 3.0804438280166435, + "grad_norm": 0.40241700719531964, + "learning_rate": 3.856737385177252e-06, + "loss": 0.3976, + "step": 4442 + }, + { + "epoch": 3.0811373092926493, + "grad_norm": 0.41342816133891147, + "learning_rate": 3.85438094204799e-06, + "loss": 0.4092, + "step": 4443 + }, + { + "epoch": 3.0818307905686546, + "grad_norm": 0.36065332055095284, + "learning_rate": 3.852024767443539e-06, + "loss": 0.3479, + "step": 4444 + }, + { + "epoch": 3.0825242718446604, + "grad_norm": 0.4019472444148632, + "learning_rate": 3.849668861916169e-06, + "loss": 0.4026, + "step": 4445 + }, + { + "epoch": 3.0832177531206657, + "grad_norm": 0.37167539357588597, + "learning_rate": 3.847313226018085e-06, + "loss": 0.4287, + "step": 4446 + }, + { + "epoch": 3.0839112343966715, + "grad_norm": 0.4102882834671503, + "learning_rate": 3.844957860301434e-06, + "loss": 0.3743, + "step": 4447 + }, + { + "epoch": 3.084604715672677, + "grad_norm": 0.4743165595528586, + "learning_rate": 3.8426027653182955e-06, + "loss": 0.3825, + "step": 4448 + }, + { + "epoch": 3.0852981969486826, + "grad_norm": 0.4946917617137666, + "learning_rate": 3.840247941620683e-06, + "loss": 0.3436, + "step": 4449 + }, + { + "epoch": 3.085991678224688, + "grad_norm": 0.42854191436665223, + "learning_rate": 3.8378933897605574e-06, + "loss": 0.3683, + "step": 4450 + }, + { + "epoch": 3.0866851595006937, + "grad_norm": 0.4163173289963767, + "learning_rate": 3.835539110289804e-06, + "loss": 0.3881, + "step": 4451 + }, + { + "epoch": 3.087378640776699, + "grad_norm": 0.38090885153552223, + "learning_rate": 3.83318510376025e-06, + "loss": 0.3892, + "step": 4452 + }, + { + "epoch": 3.0880721220527048, + "grad_norm": 0.7474026311641647, + "learning_rate": 3.8308313707236566e-06, + "loss": 0.3915, + "step": 4453 + }, + { + "epoch": 3.08876560332871, + "grad_norm": 0.416515367957737, + "learning_rate": 3.828477911731722e-06, + "loss": 0.4082, + "step": 4454 + }, + { + "epoch": 3.089459084604716, + "grad_norm": 0.38225540896970156, + "learning_rate": 3.826124727336082e-06, + "loss": 0.3708, + "step": 4455 + }, + { + "epoch": 3.090152565880721, + "grad_norm": 0.5631861252468322, + "learning_rate": 3.823771818088303e-06, + "loss": 0.3425, + "step": 4456 + }, + { + "epoch": 3.090846047156727, + "grad_norm": 0.388759511851324, + "learning_rate": 3.8214191845398925e-06, + "loss": 0.4166, + "step": 4457 + }, + { + "epoch": 3.0915395284327323, + "grad_norm": 0.3842554999377556, + "learning_rate": 3.8190668272422875e-06, + "loss": 0.375, + "step": 4458 + }, + { + "epoch": 3.092233009708738, + "grad_norm": 0.39135523441734305, + "learning_rate": 3.8167147467468655e-06, + "loss": 0.3473, + "step": 4459 + }, + { + "epoch": 3.0929264909847434, + "grad_norm": 0.3891529251234928, + "learning_rate": 3.814362943604938e-06, + "loss": 0.3962, + "step": 4460 + }, + { + "epoch": 3.093619972260749, + "grad_norm": 0.3915371632914156, + "learning_rate": 3.81201141836775e-06, + "loss": 0.3664, + "step": 4461 + }, + { + "epoch": 3.0943134535367545, + "grad_norm": 0.36919388784572393, + "learning_rate": 3.8096601715864824e-06, + "loss": 0.3342, + "step": 4462 + }, + { + "epoch": 3.0950069348127602, + "grad_norm": 0.38259619053637206, + "learning_rate": 3.8073092038122483e-06, + "loss": 0.4063, + "step": 4463 + }, + { + "epoch": 3.0957004160887656, + "grad_norm": 0.38933793033415215, + "learning_rate": 3.8049585155961e-06, + "loss": 0.3243, + "step": 4464 + }, + { + "epoch": 3.0963938973647713, + "grad_norm": 0.4338739880744865, + "learning_rate": 3.80260810748902e-06, + "loss": 0.3677, + "step": 4465 + }, + { + "epoch": 3.0970873786407767, + "grad_norm": 0.44818598840734036, + "learning_rate": 3.8002579800419276e-06, + "loss": 0.3578, + "step": 4466 + }, + { + "epoch": 3.0977808599167824, + "grad_norm": 0.39606357351676086, + "learning_rate": 3.7979081338056756e-06, + "loss": 0.3842, + "step": 4467 + }, + { + "epoch": 3.0984743411927878, + "grad_norm": 0.43804550380169477, + "learning_rate": 3.795558569331051e-06, + "loss": 0.3739, + "step": 4468 + }, + { + "epoch": 3.0991678224687935, + "grad_norm": 0.38043404835259115, + "learning_rate": 3.7932092871687754e-06, + "loss": 0.3838, + "step": 4469 + }, + { + "epoch": 3.099861303744799, + "grad_norm": 0.41754009678952525, + "learning_rate": 3.7908602878695035e-06, + "loss": 0.4116, + "step": 4470 + }, + { + "epoch": 3.1005547850208046, + "grad_norm": 0.3848029699126805, + "learning_rate": 3.7885115719838215e-06, + "loss": 0.3609, + "step": 4471 + }, + { + "epoch": 3.10124826629681, + "grad_norm": 0.5704725192060731, + "learning_rate": 3.7861631400622544e-06, + "loss": 0.43, + "step": 4472 + }, + { + "epoch": 3.1019417475728157, + "grad_norm": 0.3992635890503639, + "learning_rate": 3.7838149926552565e-06, + "loss": 0.3809, + "step": 4473 + }, + { + "epoch": 3.102635228848821, + "grad_norm": 0.3834299870363859, + "learning_rate": 3.781467130313215e-06, + "loss": 0.3954, + "step": 4474 + }, + { + "epoch": 3.103328710124827, + "grad_norm": 0.38099413579015073, + "learning_rate": 3.7791195535864543e-06, + "loss": 0.3741, + "step": 4475 + }, + { + "epoch": 3.104022191400832, + "grad_norm": 0.4581920847878088, + "learning_rate": 3.7767722630252258e-06, + "loss": 0.376, + "step": 4476 + }, + { + "epoch": 3.104715672676838, + "grad_norm": 0.3818859346766297, + "learning_rate": 3.774425259179722e-06, + "loss": 0.3731, + "step": 4477 + }, + { + "epoch": 3.1054091539528432, + "grad_norm": 0.38971293661134465, + "learning_rate": 3.7720785426000616e-06, + "loss": 0.3745, + "step": 4478 + }, + { + "epoch": 3.106102635228849, + "grad_norm": 0.3741115171117569, + "learning_rate": 3.7697321138362964e-06, + "loss": 0.3521, + "step": 4479 + }, + { + "epoch": 3.1067961165048543, + "grad_norm": 0.427065926538677, + "learning_rate": 3.7673859734384153e-06, + "loss": 0.4011, + "step": 4480 + }, + { + "epoch": 3.10748959778086, + "grad_norm": 0.44365746140787743, + "learning_rate": 3.765040121956335e-06, + "loss": 0.4323, + "step": 4481 + }, + { + "epoch": 3.1081830790568654, + "grad_norm": 0.43525203993773753, + "learning_rate": 3.7626945599399057e-06, + "loss": 0.336, + "step": 4482 + }, + { + "epoch": 3.108876560332871, + "grad_norm": 0.3777284600854801, + "learning_rate": 3.7603492879389093e-06, + "loss": 0.4419, + "step": 4483 + }, + { + "epoch": 3.1095700416088765, + "grad_norm": 0.3762643513558504, + "learning_rate": 3.7580043065030635e-06, + "loss": 0.4031, + "step": 4484 + }, + { + "epoch": 3.1102635228848823, + "grad_norm": 0.3776638373639323, + "learning_rate": 3.755659616182011e-06, + "loss": 0.378, + "step": 4485 + }, + { + "epoch": 3.1109570041608876, + "grad_norm": 0.375558249870944, + "learning_rate": 3.753315217525334e-06, + "loss": 0.3407, + "step": 4486 + }, + { + "epoch": 3.1116504854368934, + "grad_norm": 0.36291641879972486, + "learning_rate": 3.750971111082542e-06, + "loss": 0.3569, + "step": 4487 + }, + { + "epoch": 3.1123439667128987, + "grad_norm": 0.4381425116892885, + "learning_rate": 3.748627297403074e-06, + "loss": 0.3881, + "step": 4488 + }, + { + "epoch": 3.1130374479889045, + "grad_norm": 0.4206945966092455, + "learning_rate": 3.746283777036306e-06, + "loss": 0.3238, + "step": 4489 + }, + { + "epoch": 3.11373092926491, + "grad_norm": 0.36659217003345, + "learning_rate": 3.743940550531541e-06, + "loss": 0.3441, + "step": 4490 + }, + { + "epoch": 3.1144244105409156, + "grad_norm": 0.641827426954368, + "learning_rate": 3.7415976184380125e-06, + "loss": 0.3801, + "step": 4491 + }, + { + "epoch": 3.115117891816921, + "grad_norm": 0.39134077585171323, + "learning_rate": 3.73925498130489e-06, + "loss": 0.387, + "step": 4492 + }, + { + "epoch": 3.1158113730929267, + "grad_norm": 0.38067619193735663, + "learning_rate": 3.7369126396812694e-06, + "loss": 0.3642, + "step": 4493 + }, + { + "epoch": 3.116504854368932, + "grad_norm": 0.38495377604143216, + "learning_rate": 3.7345705941161757e-06, + "loss": 0.3639, + "step": 4494 + }, + { + "epoch": 3.1171983356449378, + "grad_norm": 0.45917577105591284, + "learning_rate": 3.732228845158572e-06, + "loss": 0.4108, + "step": 4495 + }, + { + "epoch": 3.117891816920943, + "grad_norm": 0.6862560129851659, + "learning_rate": 3.729887393357345e-06, + "loss": 0.4102, + "step": 4496 + }, + { + "epoch": 3.118585298196949, + "grad_norm": 0.41394043543534825, + "learning_rate": 3.7275462392613148e-06, + "loss": 0.3654, + "step": 4497 + }, + { + "epoch": 3.119278779472954, + "grad_norm": 0.390747317729945, + "learning_rate": 3.725205383419231e-06, + "loss": 0.362, + "step": 4498 + }, + { + "epoch": 3.11997226074896, + "grad_norm": 0.43877439333836693, + "learning_rate": 3.722864826379772e-06, + "loss": 0.4057, + "step": 4499 + }, + { + "epoch": 3.1206657420249653, + "grad_norm": 0.3773831358177694, + "learning_rate": 3.7205245686915486e-06, + "loss": 0.3507, + "step": 4500 + }, + { + "epoch": 3.121359223300971, + "grad_norm": 0.42652590786524336, + "learning_rate": 3.7181846109031007e-06, + "loss": 0.4007, + "step": 4501 + }, + { + "epoch": 3.1220527045769764, + "grad_norm": 0.4323533630204603, + "learning_rate": 3.715844953562896e-06, + "loss": 0.4064, + "step": 4502 + }, + { + "epoch": 3.122746185852982, + "grad_norm": 0.4509676369289724, + "learning_rate": 3.713505597219332e-06, + "loss": 0.403, + "step": 4503 + }, + { + "epoch": 3.1234396671289875, + "grad_norm": 0.3642261228219249, + "learning_rate": 3.71116654242074e-06, + "loss": 0.3487, + "step": 4504 + }, + { + "epoch": 3.1241331484049932, + "grad_norm": 0.39544002363504477, + "learning_rate": 3.7088277897153768e-06, + "loss": 0.3329, + "step": 4505 + }, + { + "epoch": 3.1248266296809986, + "grad_norm": 0.3979523920049841, + "learning_rate": 3.706489339651429e-06, + "loss": 0.3853, + "step": 4506 + }, + { + "epoch": 3.1255201109570043, + "grad_norm": 0.42283589870748417, + "learning_rate": 3.7041511927770117e-06, + "loss": 0.3716, + "step": 4507 + }, + { + "epoch": 3.1262135922330097, + "grad_norm": 0.38950004656984266, + "learning_rate": 3.7018133496401688e-06, + "loss": 0.2901, + "step": 4508 + }, + { + "epoch": 3.1269070735090154, + "grad_norm": 0.3857349942222745, + "learning_rate": 3.699475810788876e-06, + "loss": 0.3959, + "step": 4509 + }, + { + "epoch": 3.1276005547850207, + "grad_norm": 0.38254306786487735, + "learning_rate": 3.6971385767710345e-06, + "loss": 0.3679, + "step": 4510 + }, + { + "epoch": 3.1282940360610265, + "grad_norm": 0.3699064806571636, + "learning_rate": 3.694801648134474e-06, + "loss": 0.3671, + "step": 4511 + }, + { + "epoch": 3.128987517337032, + "grad_norm": 0.37103069167123504, + "learning_rate": 3.6924650254269545e-06, + "loss": 0.3857, + "step": 4512 + }, + { + "epoch": 3.1296809986130376, + "grad_norm": 0.40369084618412426, + "learning_rate": 3.6901287091961626e-06, + "loss": 0.4098, + "step": 4513 + }, + { + "epoch": 3.130374479889043, + "grad_norm": 0.4151237828786567, + "learning_rate": 3.687792699989716e-06, + "loss": 0.4195, + "step": 4514 + }, + { + "epoch": 3.1310679611650487, + "grad_norm": 0.40708269901780325, + "learning_rate": 3.685456998355158e-06, + "loss": 0.3905, + "step": 4515 + }, + { + "epoch": 3.131761442441054, + "grad_norm": 0.4433310014494769, + "learning_rate": 3.6831216048399576e-06, + "loss": 0.3944, + "step": 4516 + }, + { + "epoch": 3.13245492371706, + "grad_norm": 0.4360622441782407, + "learning_rate": 3.680786519991516e-06, + "loss": 0.3275, + "step": 4517 + }, + { + "epoch": 3.133148404993065, + "grad_norm": 0.37538970177895226, + "learning_rate": 3.678451744357161e-06, + "loss": 0.3918, + "step": 4518 + }, + { + "epoch": 3.133841886269071, + "grad_norm": 0.3703308227912376, + "learning_rate": 3.6761172784841446e-06, + "loss": 0.371, + "step": 4519 + }, + { + "epoch": 3.1345353675450762, + "grad_norm": 0.4020934089664458, + "learning_rate": 3.6737831229196506e-06, + "loss": 0.3864, + "step": 4520 + }, + { + "epoch": 3.135228848821082, + "grad_norm": 0.4086005509435353, + "learning_rate": 3.671449278210787e-06, + "loss": 0.4016, + "step": 4521 + }, + { + "epoch": 3.1359223300970873, + "grad_norm": 0.3673901868730733, + "learning_rate": 3.6691157449045915e-06, + "loss": 0.382, + "step": 4522 + }, + { + "epoch": 3.136615811373093, + "grad_norm": 0.4809131935124871, + "learning_rate": 3.666782523548027e-06, + "loss": 0.3446, + "step": 4523 + }, + { + "epoch": 3.1373092926490984, + "grad_norm": 0.41684086141260485, + "learning_rate": 3.664449614687983e-06, + "loss": 0.3499, + "step": 4524 + }, + { + "epoch": 3.138002773925104, + "grad_norm": 0.4036790077975126, + "learning_rate": 3.6621170188712773e-06, + "loss": 0.3315, + "step": 4525 + }, + { + "epoch": 3.1386962552011095, + "grad_norm": 0.41920618795044234, + "learning_rate": 3.6597847366446524e-06, + "loss": 0.373, + "step": 4526 + }, + { + "epoch": 3.1393897364771153, + "grad_norm": 0.4496728427869807, + "learning_rate": 3.6574527685547802e-06, + "loss": 0.3833, + "step": 4527 + }, + { + "epoch": 3.1400832177531206, + "grad_norm": 0.39037248721873086, + "learning_rate": 3.655121115148254e-06, + "loss": 0.3908, + "step": 4528 + }, + { + "epoch": 3.1407766990291264, + "grad_norm": 0.43033088505115946, + "learning_rate": 3.6527897769716e-06, + "loss": 0.3588, + "step": 4529 + }, + { + "epoch": 3.1414701803051317, + "grad_norm": 0.3882364138862382, + "learning_rate": 3.650458754571262e-06, + "loss": 0.3889, + "step": 4530 + }, + { + "epoch": 3.1421636615811375, + "grad_norm": 0.38693129603089405, + "learning_rate": 3.6481280484936215e-06, + "loss": 0.3548, + "step": 4531 + }, + { + "epoch": 3.142857142857143, + "grad_norm": 0.401522254946418, + "learning_rate": 3.6457976592849753e-06, + "loss": 0.3837, + "step": 4532 + }, + { + "epoch": 3.1435506241331486, + "grad_norm": 0.4144178746518509, + "learning_rate": 3.643467587491549e-06, + "loss": 0.3953, + "step": 4533 + }, + { + "epoch": 3.144244105409154, + "grad_norm": 0.4026627646459669, + "learning_rate": 3.6411378336594966e-06, + "loss": 0.3769, + "step": 4534 + }, + { + "epoch": 3.1449375866851597, + "grad_norm": 0.395956594977255, + "learning_rate": 3.6388083983348948e-06, + "loss": 0.3372, + "step": 4535 + }, + { + "epoch": 3.145631067961165, + "grad_norm": 0.4663168141131961, + "learning_rate": 3.636479282063745e-06, + "loss": 0.3863, + "step": 4536 + }, + { + "epoch": 3.1463245492371708, + "grad_norm": 0.4400011726696039, + "learning_rate": 3.6341504853919778e-06, + "loss": 0.3758, + "step": 4537 + }, + { + "epoch": 3.147018030513176, + "grad_norm": 0.3921494050754728, + "learning_rate": 3.631822008865445e-06, + "loss": 0.3677, + "step": 4538 + }, + { + "epoch": 3.147711511789182, + "grad_norm": 0.4016681240367152, + "learning_rate": 3.6294938530299216e-06, + "loss": 0.3851, + "step": 4539 + }, + { + "epoch": 3.148404993065187, + "grad_norm": 0.35104120813762685, + "learning_rate": 3.6271660184311164e-06, + "loss": 0.3677, + "step": 4540 + }, + { + "epoch": 3.149098474341193, + "grad_norm": 0.3988607916651478, + "learning_rate": 3.624838505614653e-06, + "loss": 0.3922, + "step": 4541 + }, + { + "epoch": 3.1497919556171983, + "grad_norm": 0.43849015745767567, + "learning_rate": 3.6225113151260848e-06, + "loss": 0.4291, + "step": 4542 + }, + { + "epoch": 3.150485436893204, + "grad_norm": 0.39884929207837433, + "learning_rate": 3.6201844475108884e-06, + "loss": 0.3746, + "step": 4543 + }, + { + "epoch": 3.1511789181692094, + "grad_norm": 0.38140417716325825, + "learning_rate": 3.6178579033144635e-06, + "loss": 0.3868, + "step": 4544 + }, + { + "epoch": 3.151872399445215, + "grad_norm": 0.4214282092022257, + "learning_rate": 3.615531683082137e-06, + "loss": 0.3756, + "step": 4545 + }, + { + "epoch": 3.1525658807212205, + "grad_norm": 0.37454025023252646, + "learning_rate": 3.613205787359157e-06, + "loss": 0.3443, + "step": 4546 + }, + { + "epoch": 3.1532593619972262, + "grad_norm": 0.4532746477859641, + "learning_rate": 3.610880216690697e-06, + "loss": 0.3905, + "step": 4547 + }, + { + "epoch": 3.1539528432732316, + "grad_norm": 0.4506339576500096, + "learning_rate": 3.6085549716218517e-06, + "loss": 0.3577, + "step": 4548 + }, + { + "epoch": 3.1546463245492373, + "grad_norm": 0.4472297946605434, + "learning_rate": 3.6062300526976448e-06, + "loss": 0.4038, + "step": 4549 + }, + { + "epoch": 3.1553398058252426, + "grad_norm": 0.36535844144971125, + "learning_rate": 3.6039054604630202e-06, + "loss": 0.3519, + "step": 4550 + }, + { + "epoch": 3.1560332871012484, + "grad_norm": 0.39295369856605805, + "learning_rate": 3.601581195462845e-06, + "loss": 0.3579, + "step": 4551 + }, + { + "epoch": 3.1567267683772537, + "grad_norm": 0.37912670237497886, + "learning_rate": 3.5992572582419094e-06, + "loss": 0.3209, + "step": 4552 + }, + { + "epoch": 3.1574202496532595, + "grad_norm": 0.38743517658601034, + "learning_rate": 3.596933649344927e-06, + "loss": 0.4002, + "step": 4553 + }, + { + "epoch": 3.158113730929265, + "grad_norm": 0.3822231342830769, + "learning_rate": 3.5946103693165367e-06, + "loss": 0.3685, + "step": 4554 + }, + { + "epoch": 3.1588072122052706, + "grad_norm": 0.39259526918622556, + "learning_rate": 3.5922874187012977e-06, + "loss": 0.3876, + "step": 4555 + }, + { + "epoch": 3.159500693481276, + "grad_norm": 0.3523415802257681, + "learning_rate": 3.589964798043691e-06, + "loss": 0.368, + "step": 4556 + }, + { + "epoch": 3.1601941747572817, + "grad_norm": 0.45927927993817286, + "learning_rate": 3.5876425078881245e-06, + "loss": 0.4185, + "step": 4557 + }, + { + "epoch": 3.160887656033287, + "grad_norm": 0.40915345725167057, + "learning_rate": 3.5853205487789245e-06, + "loss": 0.3709, + "step": 4558 + }, + { + "epoch": 3.161581137309293, + "grad_norm": 0.4478400318131345, + "learning_rate": 3.5829989212603445e-06, + "loss": 0.3992, + "step": 4559 + }, + { + "epoch": 3.162274618585298, + "grad_norm": 0.43303004832747666, + "learning_rate": 3.580677625876554e-06, + "loss": 0.3973, + "step": 4560 + }, + { + "epoch": 3.162968099861304, + "grad_norm": 0.36561616440696576, + "learning_rate": 3.578356663171648e-06, + "loss": 0.3916, + "step": 4561 + }, + { + "epoch": 3.163661581137309, + "grad_norm": 0.3671723246093496, + "learning_rate": 3.576036033689645e-06, + "loss": 0.3623, + "step": 4562 + }, + { + "epoch": 3.164355062413315, + "grad_norm": 0.4045849606754154, + "learning_rate": 3.573715737974483e-06, + "loss": 0.4287, + "step": 4563 + }, + { + "epoch": 3.1650485436893203, + "grad_norm": 0.4009710230310148, + "learning_rate": 3.5713957765700224e-06, + "loss": 0.3872, + "step": 4564 + }, + { + "epoch": 3.165742024965326, + "grad_norm": 0.42273243627138596, + "learning_rate": 3.5690761500200445e-06, + "loss": 0.4277, + "step": 4565 + }, + { + "epoch": 3.1664355062413314, + "grad_norm": 0.47947504806208163, + "learning_rate": 3.5667568588682523e-06, + "loss": 0.3627, + "step": 4566 + }, + { + "epoch": 3.167128987517337, + "grad_norm": 0.43195614980791464, + "learning_rate": 3.5644379036582747e-06, + "loss": 0.4079, + "step": 4567 + }, + { + "epoch": 3.1678224687933425, + "grad_norm": 0.4149106884186393, + "learning_rate": 3.5621192849336563e-06, + "loss": 0.3718, + "step": 4568 + }, + { + "epoch": 3.1685159500693483, + "grad_norm": 0.40573620487210077, + "learning_rate": 3.5598010032378614e-06, + "loss": 0.3739, + "step": 4569 + }, + { + "epoch": 3.1692094313453536, + "grad_norm": 0.3873702181727803, + "learning_rate": 3.557483059114283e-06, + "loss": 0.3394, + "step": 4570 + }, + { + "epoch": 3.1699029126213594, + "grad_norm": 0.38556620066222613, + "learning_rate": 3.5551654531062283e-06, + "loss": 0.3736, + "step": 4571 + }, + { + "epoch": 3.1705963938973647, + "grad_norm": 0.4838950478437859, + "learning_rate": 3.5528481857569276e-06, + "loss": 0.4417, + "step": 4572 + }, + { + "epoch": 3.1712898751733705, + "grad_norm": 0.4046176789685047, + "learning_rate": 3.5505312576095295e-06, + "loss": 0.3469, + "step": 4573 + }, + { + "epoch": 3.171983356449376, + "grad_norm": 0.3881344136985285, + "learning_rate": 3.5482146692071084e-06, + "loss": 0.3879, + "step": 4574 + }, + { + "epoch": 3.1726768377253816, + "grad_norm": 0.3861992773489338, + "learning_rate": 3.545898421092653e-06, + "loss": 0.3769, + "step": 4575 + }, + { + "epoch": 3.173370319001387, + "grad_norm": 0.39701912747056467, + "learning_rate": 3.5435825138090785e-06, + "loss": 0.3885, + "step": 4576 + }, + { + "epoch": 3.1740638002773927, + "grad_norm": 0.5519607197176933, + "learning_rate": 3.5412669478992143e-06, + "loss": 0.3682, + "step": 4577 + }, + { + "epoch": 3.174757281553398, + "grad_norm": 0.40347950556372547, + "learning_rate": 3.5389517239058126e-06, + "loss": 0.3676, + "step": 4578 + }, + { + "epoch": 3.1754507628294038, + "grad_norm": 0.3898340025145356, + "learning_rate": 3.5366368423715457e-06, + "loss": 0.3613, + "step": 4579 + }, + { + "epoch": 3.176144244105409, + "grad_norm": 0.4204797053937529, + "learning_rate": 3.534322303839005e-06, + "loss": 0.3872, + "step": 4580 + }, + { + "epoch": 3.176837725381415, + "grad_norm": 0.4243223771050655, + "learning_rate": 3.5320081088507006e-06, + "loss": 0.358, + "step": 4581 + }, + { + "epoch": 3.17753120665742, + "grad_norm": 0.3678015990819988, + "learning_rate": 3.5296942579490645e-06, + "loss": 0.3539, + "step": 4582 + }, + { + "epoch": 3.178224687933426, + "grad_norm": 0.3904807604473883, + "learning_rate": 3.5273807516764456e-06, + "loss": 0.3692, + "step": 4583 + }, + { + "epoch": 3.1789181692094313, + "grad_norm": 0.3869752461794236, + "learning_rate": 3.525067590575112e-06, + "loss": 0.3662, + "step": 4584 + }, + { + "epoch": 3.179611650485437, + "grad_norm": 0.39326699730368225, + "learning_rate": 3.5227547751872548e-06, + "loss": 0.4055, + "step": 4585 + }, + { + "epoch": 3.1803051317614424, + "grad_norm": 0.4099980159577789, + "learning_rate": 3.5204423060549794e-06, + "loss": 0.3886, + "step": 4586 + }, + { + "epoch": 3.180998613037448, + "grad_norm": 0.3549444392541635, + "learning_rate": 3.518130183720312e-06, + "loss": 0.3569, + "step": 4587 + }, + { + "epoch": 3.1816920943134535, + "grad_norm": 0.4175436791370744, + "learning_rate": 3.515818408725198e-06, + "loss": 0.3676, + "step": 4588 + }, + { + "epoch": 3.1823855755894592, + "grad_norm": 0.3585594232941696, + "learning_rate": 3.5135069816115e-06, + "loss": 0.3664, + "step": 4589 + }, + { + "epoch": 3.1830790568654646, + "grad_norm": 0.370043642543432, + "learning_rate": 3.511195902920998e-06, + "loss": 0.3692, + "step": 4590 + }, + { + "epoch": 3.1837725381414703, + "grad_norm": 0.38135068968923674, + "learning_rate": 3.5088851731953956e-06, + "loss": 0.3668, + "step": 4591 + }, + { + "epoch": 3.1844660194174756, + "grad_norm": 0.3907094132964532, + "learning_rate": 3.5065747929763093e-06, + "loss": 0.3366, + "step": 4592 + }, + { + "epoch": 3.1851595006934814, + "grad_norm": 0.4233589705480947, + "learning_rate": 3.5042647628052733e-06, + "loss": 0.3742, + "step": 4593 + }, + { + "epoch": 3.1858529819694867, + "grad_norm": 0.4116355800782619, + "learning_rate": 3.5019550832237458e-06, + "loss": 0.405, + "step": 4594 + }, + { + "epoch": 3.1865464632454925, + "grad_norm": 0.400903550984469, + "learning_rate": 3.4996457547730985e-06, + "loss": 0.3766, + "step": 4595 + }, + { + "epoch": 3.187239944521498, + "grad_norm": 0.39370603520421876, + "learning_rate": 3.49733677799462e-06, + "loss": 0.375, + "step": 4596 + }, + { + "epoch": 3.1879334257975036, + "grad_norm": 0.38725808482091406, + "learning_rate": 3.4950281534295176e-06, + "loss": 0.3668, + "step": 4597 + }, + { + "epoch": 3.188626907073509, + "grad_norm": 0.49294027754251885, + "learning_rate": 3.4927198816189156e-06, + "loss": 0.3109, + "step": 4598 + }, + { + "epoch": 3.1893203883495147, + "grad_norm": 0.4715187484089429, + "learning_rate": 3.4904119631038585e-06, + "loss": 0.4018, + "step": 4599 + }, + { + "epoch": 3.19001386962552, + "grad_norm": 0.4041807094580422, + "learning_rate": 3.488104398425304e-06, + "loss": 0.3645, + "step": 4600 + }, + { + "epoch": 3.190707350901526, + "grad_norm": 0.4142833713595007, + "learning_rate": 3.485797188124127e-06, + "loss": 0.3994, + "step": 4601 + }, + { + "epoch": 3.191400832177531, + "grad_norm": 0.39521559316436344, + "learning_rate": 3.4834903327411253e-06, + "loss": 0.4013, + "step": 4602 + }, + { + "epoch": 3.192094313453537, + "grad_norm": 0.621138806112336, + "learning_rate": 3.4811838328170044e-06, + "loss": 0.3708, + "step": 4603 + }, + { + "epoch": 3.192787794729542, + "grad_norm": 0.4221170334782303, + "learning_rate": 3.4788776888923947e-06, + "loss": 0.3884, + "step": 4604 + }, + { + "epoch": 3.193481276005548, + "grad_norm": 0.38681617623347475, + "learning_rate": 3.4765719015078385e-06, + "loss": 0.3368, + "step": 4605 + }, + { + "epoch": 3.1941747572815533, + "grad_norm": 0.41633729932224134, + "learning_rate": 3.4742664712037944e-06, + "loss": 0.3231, + "step": 4606 + }, + { + "epoch": 3.194868238557559, + "grad_norm": 0.3604596676437244, + "learning_rate": 3.47196139852064e-06, + "loss": 0.3383, + "step": 4607 + }, + { + "epoch": 3.1955617198335644, + "grad_norm": 0.42882658298637893, + "learning_rate": 3.469656683998668e-06, + "loss": 0.3845, + "step": 4608 + }, + { + "epoch": 3.19625520110957, + "grad_norm": 0.3828207706645611, + "learning_rate": 3.4673523281780856e-06, + "loss": 0.3577, + "step": 4609 + }, + { + "epoch": 3.1969486823855755, + "grad_norm": 0.35462348082915773, + "learning_rate": 3.4650483315990157e-06, + "loss": 0.3643, + "step": 4610 + }, + { + "epoch": 3.1976421636615813, + "grad_norm": 0.36150254945936683, + "learning_rate": 3.4627446948015007e-06, + "loss": 0.3662, + "step": 4611 + }, + { + "epoch": 3.1983356449375866, + "grad_norm": 0.3937477429821671, + "learning_rate": 3.4604414183254974e-06, + "loss": 0.379, + "step": 4612 + }, + { + "epoch": 3.1990291262135924, + "grad_norm": 0.3694025345050062, + "learning_rate": 3.458138502710876e-06, + "loss": 0.3601, + "step": 4613 + }, + { + "epoch": 3.1997226074895977, + "grad_norm": 0.3885387581009498, + "learning_rate": 3.4558359484974226e-06, + "loss": 0.3689, + "step": 4614 + }, + { + "epoch": 3.2004160887656035, + "grad_norm": 0.4140533461404799, + "learning_rate": 3.4535337562248382e-06, + "loss": 0.4066, + "step": 4615 + }, + { + "epoch": 3.201109570041609, + "grad_norm": 0.6309648393434568, + "learning_rate": 3.451231926432742e-06, + "loss": 0.3584, + "step": 4616 + }, + { + "epoch": 3.2018030513176146, + "grad_norm": 0.4944290627120866, + "learning_rate": 3.4489304596606664e-06, + "loss": 0.3371, + "step": 4617 + }, + { + "epoch": 3.20249653259362, + "grad_norm": 0.4556730862357426, + "learning_rate": 3.4466293564480562e-06, + "loss": 0.4076, + "step": 4618 + }, + { + "epoch": 3.2031900138696257, + "grad_norm": 0.38401356145804477, + "learning_rate": 3.4443286173342737e-06, + "loss": 0.3562, + "step": 4619 + }, + { + "epoch": 3.203883495145631, + "grad_norm": 0.3727558156754104, + "learning_rate": 3.4420282428585988e-06, + "loss": 0.3721, + "step": 4620 + }, + { + "epoch": 3.2045769764216367, + "grad_norm": 0.42192983817694546, + "learning_rate": 3.4397282335602205e-06, + "loss": 0.3593, + "step": 4621 + }, + { + "epoch": 3.205270457697642, + "grad_norm": 0.4215612665330896, + "learning_rate": 3.4374285899782444e-06, + "loss": 0.3758, + "step": 4622 + }, + { + "epoch": 3.205963938973648, + "grad_norm": 0.38203795863831636, + "learning_rate": 3.435129312651688e-06, + "loss": 0.3806, + "step": 4623 + }, + { + "epoch": 3.206657420249653, + "grad_norm": 0.512997472099785, + "learning_rate": 3.4328304021194905e-06, + "loss": 0.3708, + "step": 4624 + }, + { + "epoch": 3.207350901525659, + "grad_norm": 0.38846156156263434, + "learning_rate": 3.430531858920495e-06, + "loss": 0.3862, + "step": 4625 + }, + { + "epoch": 3.2080443828016643, + "grad_norm": 0.3709959064371235, + "learning_rate": 3.4282336835934647e-06, + "loss": 0.3754, + "step": 4626 + }, + { + "epoch": 3.20873786407767, + "grad_norm": 0.37436449041591935, + "learning_rate": 3.425935876677077e-06, + "loss": 0.3519, + "step": 4627 + }, + { + "epoch": 3.2094313453536754, + "grad_norm": 0.43123123274058006, + "learning_rate": 3.4236384387099174e-06, + "loss": 0.3806, + "step": 4628 + }, + { + "epoch": 3.210124826629681, + "grad_norm": 0.38365617875949104, + "learning_rate": 3.421341370230493e-06, + "loss": 0.39, + "step": 4629 + }, + { + "epoch": 3.2108183079056865, + "grad_norm": 0.4015275316706064, + "learning_rate": 3.4190446717772185e-06, + "loss": 0.3946, + "step": 4630 + }, + { + "epoch": 3.2115117891816922, + "grad_norm": 0.3760587210598077, + "learning_rate": 3.4167483438884223e-06, + "loss": 0.3467, + "step": 4631 + }, + { + "epoch": 3.2122052704576975, + "grad_norm": 0.43362930010163175, + "learning_rate": 3.4144523871023494e-06, + "loss": 0.411, + "step": 4632 + }, + { + "epoch": 3.2128987517337033, + "grad_norm": 0.38667502702388695, + "learning_rate": 3.4121568019571528e-06, + "loss": 0.3636, + "step": 4633 + }, + { + "epoch": 3.2135922330097086, + "grad_norm": 0.4319684190430007, + "learning_rate": 3.4098615889909025e-06, + "loss": 0.3482, + "step": 4634 + }, + { + "epoch": 3.2142857142857144, + "grad_norm": 0.43155431710589437, + "learning_rate": 3.4075667487415785e-06, + "loss": 0.4027, + "step": 4635 + }, + { + "epoch": 3.2149791955617197, + "grad_norm": 0.45202120508256316, + "learning_rate": 3.4052722817470767e-06, + "loss": 0.4174, + "step": 4636 + }, + { + "epoch": 3.2156726768377255, + "grad_norm": 0.3647606637363773, + "learning_rate": 3.4029781885452007e-06, + "loss": 0.3712, + "step": 4637 + }, + { + "epoch": 3.216366158113731, + "grad_norm": 0.40252336097090957, + "learning_rate": 3.400684469673673e-06, + "loss": 0.4076, + "step": 4638 + }, + { + "epoch": 3.2170596393897366, + "grad_norm": 0.41251079528321616, + "learning_rate": 3.398391125670123e-06, + "loss": 0.3832, + "step": 4639 + }, + { + "epoch": 3.217753120665742, + "grad_norm": 0.433025357876695, + "learning_rate": 3.3960981570720918e-06, + "loss": 0.3585, + "step": 4640 + }, + { + "epoch": 3.2184466019417477, + "grad_norm": 0.3987377400454232, + "learning_rate": 3.3938055644170387e-06, + "loss": 0.3563, + "step": 4641 + }, + { + "epoch": 3.219140083217753, + "grad_norm": 0.3724300335347595, + "learning_rate": 3.391513348242328e-06, + "loss": 0.3402, + "step": 4642 + }, + { + "epoch": 3.219833564493759, + "grad_norm": 0.4322215115821083, + "learning_rate": 3.3892215090852387e-06, + "loss": 0.3724, + "step": 4643 + }, + { + "epoch": 3.220527045769764, + "grad_norm": 0.3985720750929115, + "learning_rate": 3.3869300474829625e-06, + "loss": 0.4023, + "step": 4644 + }, + { + "epoch": 3.22122052704577, + "grad_norm": 0.3978605034531885, + "learning_rate": 3.3846389639726007e-06, + "loss": 0.398, + "step": 4645 + }, + { + "epoch": 3.221914008321775, + "grad_norm": 0.4405848916175234, + "learning_rate": 3.382348259091165e-06, + "loss": 0.3993, + "step": 4646 + }, + { + "epoch": 3.222607489597781, + "grad_norm": 0.39587299525288566, + "learning_rate": 3.380057933375584e-06, + "loss": 0.4011, + "step": 4647 + }, + { + "epoch": 3.2233009708737863, + "grad_norm": 0.4420675634961044, + "learning_rate": 3.37776798736269e-06, + "loss": 0.36, + "step": 4648 + }, + { + "epoch": 3.223994452149792, + "grad_norm": 0.6828239560594047, + "learning_rate": 3.375478421589232e-06, + "loss": 0.351, + "step": 4649 + }, + { + "epoch": 3.2246879334257974, + "grad_norm": 0.3933581398346004, + "learning_rate": 3.373189236591867e-06, + "loss": 0.3705, + "step": 4650 + }, + { + "epoch": 3.225381414701803, + "grad_norm": 0.40404088842264474, + "learning_rate": 3.3709004329071613e-06, + "loss": 0.374, + "step": 4651 + }, + { + "epoch": 3.2260748959778085, + "grad_norm": 0.3742425957208491, + "learning_rate": 3.368612011071597e-06, + "loss": 0.3744, + "step": 4652 + }, + { + "epoch": 3.2267683772538143, + "grad_norm": 0.3593800806284897, + "learning_rate": 3.366323971621562e-06, + "loss": 0.3139, + "step": 4653 + }, + { + "epoch": 3.2274618585298196, + "grad_norm": 0.37775553551268687, + "learning_rate": 3.3640363150933574e-06, + "loss": 0.3681, + "step": 4654 + }, + { + "epoch": 3.2281553398058254, + "grad_norm": 0.4205893592818195, + "learning_rate": 3.361749042023189e-06, + "loss": 0.3747, + "step": 4655 + }, + { + "epoch": 3.2288488210818307, + "grad_norm": 0.39380798785953863, + "learning_rate": 3.359462152947182e-06, + "loss": 0.3979, + "step": 4656 + }, + { + "epoch": 3.2295423023578365, + "grad_norm": 0.4170226723703579, + "learning_rate": 3.357175648401366e-06, + "loss": 0.4508, + "step": 4657 + }, + { + "epoch": 3.230235783633842, + "grad_norm": 0.4188910446555926, + "learning_rate": 3.3548895289216802e-06, + "loss": 0.3853, + "step": 4658 + }, + { + "epoch": 3.2309292649098476, + "grad_norm": 0.3963257745135273, + "learning_rate": 3.3526037950439748e-06, + "loss": 0.3628, + "step": 4659 + }, + { + "epoch": 3.231622746185853, + "grad_norm": 0.3858475732876974, + "learning_rate": 3.3503184473040074e-06, + "loss": 0.3611, + "step": 4660 + }, + { + "epoch": 3.2323162274618586, + "grad_norm": 0.49357399049790474, + "learning_rate": 3.3480334862374484e-06, + "loss": 0.4091, + "step": 4661 + }, + { + "epoch": 3.233009708737864, + "grad_norm": 0.3614221256334901, + "learning_rate": 3.345748912379878e-06, + "loss": 0.3545, + "step": 4662 + }, + { + "epoch": 3.2337031900138697, + "grad_norm": 0.376766513471375, + "learning_rate": 3.34346472626678e-06, + "loss": 0.3884, + "step": 4663 + }, + { + "epoch": 3.234396671289875, + "grad_norm": 0.421475352248187, + "learning_rate": 3.3411809284335527e-06, + "loss": 0.3468, + "step": 4664 + }, + { + "epoch": 3.235090152565881, + "grad_norm": 0.4049583045952764, + "learning_rate": 3.338897519415502e-06, + "loss": 0.3711, + "step": 4665 + }, + { + "epoch": 3.235783633841886, + "grad_norm": 0.4096439404025271, + "learning_rate": 3.336614499747844e-06, + "loss": 0.3842, + "step": 4666 + }, + { + "epoch": 3.236477115117892, + "grad_norm": 0.39925744089105675, + "learning_rate": 3.3343318699657e-06, + "loss": 0.382, + "step": 4667 + }, + { + "epoch": 3.2371705963938973, + "grad_norm": 0.40684812844166546, + "learning_rate": 3.3320496306041016e-06, + "loss": 0.3792, + "step": 4668 + }, + { + "epoch": 3.237864077669903, + "grad_norm": 0.38669919153425775, + "learning_rate": 3.329767782197991e-06, + "loss": 0.3842, + "step": 4669 + }, + { + "epoch": 3.2385575589459084, + "grad_norm": 0.359739471700017, + "learning_rate": 3.3274863252822155e-06, + "loss": 0.3692, + "step": 4670 + }, + { + "epoch": 3.239251040221914, + "grad_norm": 0.3821369711369388, + "learning_rate": 3.325205260391532e-06, + "loss": 0.3735, + "step": 4671 + }, + { + "epoch": 3.2399445214979194, + "grad_norm": 0.39425492535221834, + "learning_rate": 3.3229245880606063e-06, + "loss": 0.3463, + "step": 4672 + }, + { + "epoch": 3.240638002773925, + "grad_norm": 0.4021031385456516, + "learning_rate": 3.32064430882401e-06, + "loss": 0.3319, + "step": 4673 + }, + { + "epoch": 3.2413314840499305, + "grad_norm": 0.4140595911675711, + "learning_rate": 3.3183644232162264e-06, + "loss": 0.3751, + "step": 4674 + }, + { + "epoch": 3.2420249653259363, + "grad_norm": 0.7733574661774831, + "learning_rate": 3.3160849317716436e-06, + "loss": 0.3804, + "step": 4675 + }, + { + "epoch": 3.2427184466019416, + "grad_norm": 0.6079882560885275, + "learning_rate": 3.313805835024556e-06, + "loss": 0.3749, + "step": 4676 + }, + { + "epoch": 3.2434119278779474, + "grad_norm": 0.3599516991109779, + "learning_rate": 3.31152713350917e-06, + "loss": 0.3632, + "step": 4677 + }, + { + "epoch": 3.2441054091539527, + "grad_norm": 0.3856308005868276, + "learning_rate": 3.3092488277595956e-06, + "loss": 0.3901, + "step": 4678 + }, + { + "epoch": 3.2447988904299585, + "grad_norm": 0.38346741669811346, + "learning_rate": 3.306970918309851e-06, + "loss": 0.3877, + "step": 4679 + }, + { + "epoch": 3.245492371705964, + "grad_norm": 0.41299761280186253, + "learning_rate": 3.3046934056938597e-06, + "loss": 0.4434, + "step": 4680 + }, + { + "epoch": 3.2461858529819696, + "grad_norm": 0.37844739613202444, + "learning_rate": 3.3024162904454584e-06, + "loss": 0.4111, + "step": 4681 + }, + { + "epoch": 3.246879334257975, + "grad_norm": 0.40472183159992986, + "learning_rate": 3.300139573098381e-06, + "loss": 0.3917, + "step": 4682 + }, + { + "epoch": 3.2475728155339807, + "grad_norm": 0.4343151573089637, + "learning_rate": 3.2978632541862788e-06, + "loss": 0.3872, + "step": 4683 + }, + { + "epoch": 3.248266296809986, + "grad_norm": 0.386539834709046, + "learning_rate": 3.295587334242703e-06, + "loss": 0.3422, + "step": 4684 + }, + { + "epoch": 3.248959778085992, + "grad_norm": 0.41752111355890925, + "learning_rate": 3.293311813801111e-06, + "loss": 0.4358, + "step": 4685 + }, + { + "epoch": 3.249653259361997, + "grad_norm": 0.5154575876616192, + "learning_rate": 3.29103669339487e-06, + "loss": 0.3778, + "step": 4686 + }, + { + "epoch": 3.250346740638003, + "grad_norm": 0.3897439344895969, + "learning_rate": 3.2887619735572517e-06, + "loss": 0.4008, + "step": 4687 + }, + { + "epoch": 3.251040221914008, + "grad_norm": 0.37212187353874293, + "learning_rate": 3.286487654821432e-06, + "loss": 0.3511, + "step": 4688 + }, + { + "epoch": 3.251733703190014, + "grad_norm": 0.38096950234035915, + "learning_rate": 3.2842137377204977e-06, + "loss": 0.3933, + "step": 4689 + }, + { + "epoch": 3.2524271844660193, + "grad_norm": 0.45556083132043235, + "learning_rate": 3.2819402227874364e-06, + "loss": 0.3588, + "step": 4690 + }, + { + "epoch": 3.253120665742025, + "grad_norm": 0.46434375002141354, + "learning_rate": 3.2796671105551425e-06, + "loss": 0.3659, + "step": 4691 + }, + { + "epoch": 3.2538141470180304, + "grad_norm": 0.71621304486904, + "learning_rate": 3.2773944015564203e-06, + "loss": 0.3612, + "step": 4692 + }, + { + "epoch": 3.254507628294036, + "grad_norm": 0.46015280690996024, + "learning_rate": 3.275122096323974e-06, + "loss": 0.3706, + "step": 4693 + }, + { + "epoch": 3.2552011095700415, + "grad_norm": 0.39217217473107296, + "learning_rate": 3.272850195390417e-06, + "loss": 0.3564, + "step": 4694 + }, + { + "epoch": 3.2558945908460473, + "grad_norm": 0.3757902600375741, + "learning_rate": 3.2705786992882656e-06, + "loss": 0.3831, + "step": 4695 + }, + { + "epoch": 3.2565880721220526, + "grad_norm": 0.44250048647734747, + "learning_rate": 3.268307608549941e-06, + "loss": 0.3645, + "step": 4696 + }, + { + "epoch": 3.2572815533980584, + "grad_norm": 0.43052569761464365, + "learning_rate": 3.2660369237077726e-06, + "loss": 0.4026, + "step": 4697 + }, + { + "epoch": 3.2579750346740637, + "grad_norm": 0.38384950579415383, + "learning_rate": 3.2637666452939908e-06, + "loss": 0.3712, + "step": 4698 + }, + { + "epoch": 3.2586685159500695, + "grad_norm": 0.40359480650687335, + "learning_rate": 3.2614967738407332e-06, + "loss": 0.3735, + "step": 4699 + }, + { + "epoch": 3.259361997226075, + "grad_norm": 0.406477739696617, + "learning_rate": 3.2592273098800396e-06, + "loss": 0.3656, + "step": 4700 + }, + { + "epoch": 3.2600554785020806, + "grad_norm": 0.47092113457420953, + "learning_rate": 3.2569582539438577e-06, + "loss": 0.3872, + "step": 4701 + }, + { + "epoch": 3.260748959778086, + "grad_norm": 0.42953117680258723, + "learning_rate": 3.254689606564039e-06, + "loss": 0.3771, + "step": 4702 + }, + { + "epoch": 3.2614424410540916, + "grad_norm": 0.37939592440727415, + "learning_rate": 3.252421368272336e-06, + "loss": 0.3772, + "step": 4703 + }, + { + "epoch": 3.262135922330097, + "grad_norm": 0.4047359199208624, + "learning_rate": 3.250153539600407e-06, + "loss": 0.3458, + "step": 4704 + }, + { + "epoch": 3.2628294036061027, + "grad_norm": 0.40822260083803935, + "learning_rate": 3.2478861210798153e-06, + "loss": 0.4031, + "step": 4705 + }, + { + "epoch": 3.263522884882108, + "grad_norm": 0.3976685892114422, + "learning_rate": 3.245619113242028e-06, + "loss": 0.3787, + "step": 4706 + }, + { + "epoch": 3.264216366158114, + "grad_norm": 0.39611973753983676, + "learning_rate": 3.243352516618415e-06, + "loss": 0.3805, + "step": 4707 + }, + { + "epoch": 3.264909847434119, + "grad_norm": 0.376861464491667, + "learning_rate": 3.2410863317402486e-06, + "loss": 0.343, + "step": 4708 + }, + { + "epoch": 3.265603328710125, + "grad_norm": 0.43254684201661425, + "learning_rate": 3.238820559138707e-06, + "loss": 0.3848, + "step": 4709 + }, + { + "epoch": 3.2662968099861303, + "grad_norm": 0.37562919250073, + "learning_rate": 3.236555199344872e-06, + "loss": 0.3735, + "step": 4710 + }, + { + "epoch": 3.266990291262136, + "grad_norm": 0.4552139783968729, + "learning_rate": 3.234290252889728e-06, + "loss": 0.3882, + "step": 4711 + }, + { + "epoch": 3.2676837725381414, + "grad_norm": 0.4104471229451528, + "learning_rate": 3.2320257203041605e-06, + "loss": 0.3632, + "step": 4712 + }, + { + "epoch": 3.268377253814147, + "grad_norm": 0.4535476664720102, + "learning_rate": 3.229761602118958e-06, + "loss": 0.3797, + "step": 4713 + }, + { + "epoch": 3.2690707350901524, + "grad_norm": 0.3961736462119493, + "learning_rate": 3.2274978988648175e-06, + "loss": 0.3479, + "step": 4714 + }, + { + "epoch": 3.269764216366158, + "grad_norm": 0.3773652200780356, + "learning_rate": 3.225234611072332e-06, + "loss": 0.3328, + "step": 4715 + }, + { + "epoch": 3.2704576976421635, + "grad_norm": 0.3630660874836001, + "learning_rate": 3.2229717392719996e-06, + "loss": 0.3514, + "step": 4716 + }, + { + "epoch": 3.2711511789181693, + "grad_norm": 0.4070481699030177, + "learning_rate": 3.220709283994222e-06, + "loss": 0.3853, + "step": 4717 + }, + { + "epoch": 3.2718446601941746, + "grad_norm": 0.39008883733768285, + "learning_rate": 3.2184472457693005e-06, + "loss": 0.3512, + "step": 4718 + }, + { + "epoch": 3.2725381414701804, + "grad_norm": 0.4063513655394903, + "learning_rate": 3.216185625127444e-06, + "loss": 0.3846, + "step": 4719 + }, + { + "epoch": 3.2732316227461857, + "grad_norm": 0.38393336996444766, + "learning_rate": 3.2139244225987576e-06, + "loss": 0.3675, + "step": 4720 + }, + { + "epoch": 3.2739251040221915, + "grad_norm": 0.42296411593548733, + "learning_rate": 3.2116636387132506e-06, + "loss": 0.3863, + "step": 4721 + }, + { + "epoch": 3.274618585298197, + "grad_norm": 0.3678992924450587, + "learning_rate": 3.209403274000835e-06, + "loss": 0.4057, + "step": 4722 + }, + { + "epoch": 3.2753120665742026, + "grad_norm": 0.41732537123782387, + "learning_rate": 3.2071433289913252e-06, + "loss": 0.3494, + "step": 4723 + }, + { + "epoch": 3.276005547850208, + "grad_norm": 0.4722948230226156, + "learning_rate": 3.2048838042144337e-06, + "loss": 0.3582, + "step": 4724 + }, + { + "epoch": 3.2766990291262137, + "grad_norm": 0.3742099048030128, + "learning_rate": 3.202624700199777e-06, + "loss": 0.3717, + "step": 4725 + }, + { + "epoch": 3.277392510402219, + "grad_norm": 0.3793621842235678, + "learning_rate": 3.2003660174768746e-06, + "loss": 0.3394, + "step": 4726 + }, + { + "epoch": 3.278085991678225, + "grad_norm": 0.4103144771838663, + "learning_rate": 3.198107756575142e-06, + "loss": 0.3663, + "step": 4727 + }, + { + "epoch": 3.27877947295423, + "grad_norm": 0.4395224447677697, + "learning_rate": 3.195849918023903e-06, + "loss": 0.3768, + "step": 4728 + }, + { + "epoch": 3.279472954230236, + "grad_norm": 0.38681679074906994, + "learning_rate": 3.1935925023523775e-06, + "loss": 0.38, + "step": 4729 + }, + { + "epoch": 3.280166435506241, + "grad_norm": 0.3828074876963442, + "learning_rate": 3.191335510089685e-06, + "loss": 0.3143, + "step": 4730 + }, + { + "epoch": 3.280859916782247, + "grad_norm": 0.4169252349768072, + "learning_rate": 3.1890789417648515e-06, + "loss": 0.3636, + "step": 4731 + }, + { + "epoch": 3.2815533980582523, + "grad_norm": 0.40541723504443294, + "learning_rate": 3.1868227979067985e-06, + "loss": 0.3896, + "step": 4732 + }, + { + "epoch": 3.282246879334258, + "grad_norm": 0.38123370082559205, + "learning_rate": 3.1845670790443495e-06, + "loss": 0.3865, + "step": 4733 + }, + { + "epoch": 3.2829403606102634, + "grad_norm": 0.40263127376607377, + "learning_rate": 3.1823117857062297e-06, + "loss": 0.4081, + "step": 4734 + }, + { + "epoch": 3.283633841886269, + "grad_norm": 0.39593720799173643, + "learning_rate": 3.1800569184210627e-06, + "loss": 0.3935, + "step": 4735 + }, + { + "epoch": 3.2843273231622745, + "grad_norm": 0.4232411873435769, + "learning_rate": 3.177802477717372e-06, + "loss": 0.3895, + "step": 4736 + }, + { + "epoch": 3.2850208044382803, + "grad_norm": 0.460907159586181, + "learning_rate": 3.1755484641235855e-06, + "loss": 0.403, + "step": 4737 + }, + { + "epoch": 3.2857142857142856, + "grad_norm": 0.4113030507654702, + "learning_rate": 3.173294878168025e-06, + "loss": 0.359, + "step": 4738 + }, + { + "epoch": 3.2864077669902914, + "grad_norm": 0.3928490884002706, + "learning_rate": 3.1710417203789155e-06, + "loss": 0.395, + "step": 4739 + }, + { + "epoch": 3.2871012482662967, + "grad_norm": 0.40122412846447003, + "learning_rate": 3.1687889912843816e-06, + "loss": 0.4225, + "step": 4740 + }, + { + "epoch": 3.2877947295423025, + "grad_norm": 0.3930969110948373, + "learning_rate": 3.1665366914124452e-06, + "loss": 0.3925, + "step": 4741 + }, + { + "epoch": 3.2884882108183078, + "grad_norm": 0.4605346063562092, + "learning_rate": 3.1642848212910297e-06, + "loss": 0.3973, + "step": 4742 + }, + { + "epoch": 3.2891816920943135, + "grad_norm": 0.406540718644168, + "learning_rate": 3.1620333814479583e-06, + "loss": 0.3466, + "step": 4743 + }, + { + "epoch": 3.289875173370319, + "grad_norm": 0.38605095746544155, + "learning_rate": 3.15978237241095e-06, + "loss": 0.3717, + "step": 4744 + }, + { + "epoch": 3.2905686546463246, + "grad_norm": 0.4349248871930986, + "learning_rate": 3.157531794707625e-06, + "loss": 0.4058, + "step": 4745 + }, + { + "epoch": 3.29126213592233, + "grad_norm": 0.42455338847461416, + "learning_rate": 3.1552816488655042e-06, + "loss": 0.3972, + "step": 4746 + }, + { + "epoch": 3.2919556171983357, + "grad_norm": 0.42332846630236565, + "learning_rate": 3.1530319354120058e-06, + "loss": 0.3578, + "step": 4747 + }, + { + "epoch": 3.292649098474341, + "grad_norm": 0.39867405952893936, + "learning_rate": 3.150782654874446e-06, + "loss": 0.3941, + "step": 4748 + }, + { + "epoch": 3.293342579750347, + "grad_norm": 0.4216220441924548, + "learning_rate": 3.148533807780038e-06, + "loss": 0.3792, + "step": 4749 + }, + { + "epoch": 3.294036061026352, + "grad_norm": 0.33972172852554966, + "learning_rate": 3.146285394655896e-06, + "loss": 0.3702, + "step": 4750 + }, + { + "epoch": 3.294729542302358, + "grad_norm": 0.38013695173073153, + "learning_rate": 3.144037416029034e-06, + "loss": 0.3621, + "step": 4751 + }, + { + "epoch": 3.2954230235783633, + "grad_norm": 0.6888525340981343, + "learning_rate": 3.1417898724263598e-06, + "loss": 0.4257, + "step": 4752 + }, + { + "epoch": 3.296116504854369, + "grad_norm": 0.3772052478915104, + "learning_rate": 3.1395427643746802e-06, + "loss": 0.3725, + "step": 4753 + }, + { + "epoch": 3.2968099861303743, + "grad_norm": 0.510334734999082, + "learning_rate": 3.1372960924007027e-06, + "loss": 0.378, + "step": 4754 + }, + { + "epoch": 3.29750346740638, + "grad_norm": 0.4193102882342751, + "learning_rate": 3.135049857031031e-06, + "loss": 0.4035, + "step": 4755 + }, + { + "epoch": 3.2981969486823854, + "grad_norm": 0.41504374375765757, + "learning_rate": 3.1328040587921672e-06, + "loss": 0.3601, + "step": 4756 + }, + { + "epoch": 3.298890429958391, + "grad_norm": 0.46961796996100313, + "learning_rate": 3.1305586982105097e-06, + "loss": 0.3694, + "step": 4757 + }, + { + "epoch": 3.2995839112343965, + "grad_norm": 0.38038206847740236, + "learning_rate": 3.1283137758123523e-06, + "loss": 0.3833, + "step": 4758 + }, + { + "epoch": 3.3002773925104023, + "grad_norm": 0.4153138935633195, + "learning_rate": 3.1260692921238917e-06, + "loss": 0.3742, + "step": 4759 + }, + { + "epoch": 3.3009708737864076, + "grad_norm": 0.39352053174162527, + "learning_rate": 3.123825247671217e-06, + "loss": 0.3966, + "step": 4760 + }, + { + "epoch": 3.3016643550624134, + "grad_norm": 0.39420314251902044, + "learning_rate": 3.1215816429803174e-06, + "loss": 0.408, + "step": 4761 + }, + { + "epoch": 3.3023578363384187, + "grad_norm": 0.4174036995541163, + "learning_rate": 3.1193384785770755e-06, + "loss": 0.3674, + "step": 4762 + }, + { + "epoch": 3.3030513176144245, + "grad_norm": 0.501637802834254, + "learning_rate": 3.1170957549872718e-06, + "loss": 0.3761, + "step": 4763 + }, + { + "epoch": 3.30374479889043, + "grad_norm": 0.4786328205287392, + "learning_rate": 3.1148534727365894e-06, + "loss": 0.3821, + "step": 4764 + }, + { + "epoch": 3.3044382801664356, + "grad_norm": 0.3686093336736551, + "learning_rate": 3.1126116323505996e-06, + "loss": 0.3975, + "step": 4765 + }, + { + "epoch": 3.305131761442441, + "grad_norm": 0.37392921484159614, + "learning_rate": 3.110370234354773e-06, + "loss": 0.3727, + "step": 4766 + }, + { + "epoch": 3.3058252427184467, + "grad_norm": 0.5261589475294723, + "learning_rate": 3.1081292792744793e-06, + "loss": 0.3544, + "step": 4767 + }, + { + "epoch": 3.306518723994452, + "grad_norm": 0.40050830384526626, + "learning_rate": 3.1058887676349814e-06, + "loss": 0.4168, + "step": 4768 + }, + { + "epoch": 3.307212205270458, + "grad_norm": 0.39231050690104, + "learning_rate": 3.103648699961438e-06, + "loss": 0.3495, + "step": 4769 + }, + { + "epoch": 3.307905686546463, + "grad_norm": 0.37504240401756256, + "learning_rate": 3.101409076778904e-06, + "loss": 0.342, + "step": 4770 + }, + { + "epoch": 3.308599167822469, + "grad_norm": 0.3837277033299974, + "learning_rate": 3.0991698986123343e-06, + "loss": 0.3919, + "step": 4771 + }, + { + "epoch": 3.309292649098474, + "grad_norm": 0.3643823548751013, + "learning_rate": 3.096931165986571e-06, + "loss": 0.3629, + "step": 4772 + }, + { + "epoch": 3.30998613037448, + "grad_norm": 0.3665139520803244, + "learning_rate": 3.094692879426362e-06, + "loss": 0.3798, + "step": 4773 + }, + { + "epoch": 3.3106796116504853, + "grad_norm": 0.368599050542166, + "learning_rate": 3.0924550394563433e-06, + "loss": 0.3737, + "step": 4774 + }, + { + "epoch": 3.311373092926491, + "grad_norm": 0.3744984842661619, + "learning_rate": 3.090217646601047e-06, + "loss": 0.4023, + "step": 4775 + }, + { + "epoch": 3.3120665742024964, + "grad_norm": 0.7820708735439355, + "learning_rate": 3.0879807013849037e-06, + "loss": 0.3911, + "step": 4776 + }, + { + "epoch": 3.312760055478502, + "grad_norm": 0.37775588113662956, + "learning_rate": 3.085744204332237e-06, + "loss": 0.3765, + "step": 4777 + }, + { + "epoch": 3.3134535367545075, + "grad_norm": 0.3806866399607705, + "learning_rate": 3.083508155967264e-06, + "loss": 0.3668, + "step": 4778 + }, + { + "epoch": 3.3141470180305133, + "grad_norm": 0.3874645731776387, + "learning_rate": 3.081272556814101e-06, + "loss": 0.4059, + "step": 4779 + }, + { + "epoch": 3.3148404993065186, + "grad_norm": 0.3763347121067024, + "learning_rate": 3.0790374073967523e-06, + "loss": 0.3739, + "step": 4780 + }, + { + "epoch": 3.3155339805825244, + "grad_norm": 0.4099787943029592, + "learning_rate": 3.0768027082391246e-06, + "loss": 0.3769, + "step": 4781 + }, + { + "epoch": 3.3162274618585297, + "grad_norm": 0.43572291503787564, + "learning_rate": 3.074568459865014e-06, + "loss": 0.3709, + "step": 4782 + }, + { + "epoch": 3.3169209431345354, + "grad_norm": 0.435155025887218, + "learning_rate": 3.07233466279811e-06, + "loss": 0.3549, + "step": 4783 + }, + { + "epoch": 3.3176144244105408, + "grad_norm": 0.3994661726058533, + "learning_rate": 3.070101317562002e-06, + "loss": 0.4443, + "step": 4784 + }, + { + "epoch": 3.3183079056865465, + "grad_norm": 0.44061617332841746, + "learning_rate": 3.0678684246801684e-06, + "loss": 0.4357, + "step": 4785 + }, + { + "epoch": 3.319001386962552, + "grad_norm": 0.4091944474569298, + "learning_rate": 3.065635984675982e-06, + "loss": 0.3866, + "step": 4786 + }, + { + "epoch": 3.3196948682385576, + "grad_norm": 0.3735030393703658, + "learning_rate": 3.0634039980727115e-06, + "loss": 0.3694, + "step": 4787 + }, + { + "epoch": 3.320388349514563, + "grad_norm": 0.3981062550907113, + "learning_rate": 3.0611724653935184e-06, + "loss": 0.409, + "step": 4788 + }, + { + "epoch": 3.3210818307905687, + "grad_norm": 0.37682629797510875, + "learning_rate": 3.058941387161456e-06, + "loss": 0.3722, + "step": 4789 + }, + { + "epoch": 3.321775312066574, + "grad_norm": 0.43033083528732036, + "learning_rate": 3.0567107638994775e-06, + "loss": 0.3731, + "step": 4790 + }, + { + "epoch": 3.32246879334258, + "grad_norm": 0.4876268695971904, + "learning_rate": 3.0544805961304203e-06, + "loss": 0.3594, + "step": 4791 + }, + { + "epoch": 3.323162274618585, + "grad_norm": 0.36190890433520706, + "learning_rate": 3.0522508843770217e-06, + "loss": 0.3665, + "step": 4792 + }, + { + "epoch": 3.323855755894591, + "grad_norm": 0.3784287080428108, + "learning_rate": 3.05002162916191e-06, + "loss": 0.3573, + "step": 4793 + }, + { + "epoch": 3.3245492371705962, + "grad_norm": 0.41364371948826995, + "learning_rate": 3.0477928310076066e-06, + "loss": 0.3925, + "step": 4794 + }, + { + "epoch": 3.325242718446602, + "grad_norm": 0.35301376080114627, + "learning_rate": 3.0455644904365234e-06, + "loss": 0.3451, + "step": 4795 + }, + { + "epoch": 3.3259361997226073, + "grad_norm": 0.3908244398272018, + "learning_rate": 3.0433366079709705e-06, + "loss": 0.3439, + "step": 4796 + }, + { + "epoch": 3.326629680998613, + "grad_norm": 0.3793879081130786, + "learning_rate": 3.0411091841331454e-06, + "loss": 0.3678, + "step": 4797 + }, + { + "epoch": 3.3273231622746184, + "grad_norm": 0.3869478050302743, + "learning_rate": 3.0388822194451385e-06, + "loss": 0.3837, + "step": 4798 + }, + { + "epoch": 3.328016643550624, + "grad_norm": 0.4083021642711554, + "learning_rate": 3.036655714428939e-06, + "loss": 0.3455, + "step": 4799 + }, + { + "epoch": 3.3287101248266295, + "grad_norm": 0.38226088788306095, + "learning_rate": 3.034429669606419e-06, + "loss": 0.3493, + "step": 4800 + }, + { + "epoch": 3.3294036061026353, + "grad_norm": 0.4812896414464564, + "learning_rate": 3.0322040854993508e-06, + "loss": 0.3887, + "step": 4801 + }, + { + "epoch": 3.3300970873786406, + "grad_norm": 0.38464444585631524, + "learning_rate": 3.029978962629393e-06, + "loss": 0.3692, + "step": 4802 + }, + { + "epoch": 3.3307905686546464, + "grad_norm": 0.3615496548988666, + "learning_rate": 3.0277543015180976e-06, + "loss": 0.3929, + "step": 4803 + }, + { + "epoch": 3.3314840499306517, + "grad_norm": 0.4377609914098718, + "learning_rate": 3.0255301026869118e-06, + "loss": 0.3566, + "step": 4804 + }, + { + "epoch": 3.3321775312066575, + "grad_norm": 0.4003491932807638, + "learning_rate": 3.02330636665717e-06, + "loss": 0.3875, + "step": 4805 + }, + { + "epoch": 3.332871012482663, + "grad_norm": 0.39813084971419965, + "learning_rate": 3.0210830939501e-06, + "loss": 0.3799, + "step": 4806 + }, + { + "epoch": 3.3335644937586686, + "grad_norm": 0.4009090654200237, + "learning_rate": 3.0188602850868186e-06, + "loss": 0.3765, + "step": 4807 + }, + { + "epoch": 3.334257975034674, + "grad_norm": 0.4211772999469095, + "learning_rate": 3.0166379405883394e-06, + "loss": 0.3624, + "step": 4808 + }, + { + "epoch": 3.3349514563106797, + "grad_norm": 0.39689042013027537, + "learning_rate": 3.0144160609755635e-06, + "loss": 0.4124, + "step": 4809 + }, + { + "epoch": 3.335644937586685, + "grad_norm": 0.4267410643883082, + "learning_rate": 3.012194646769283e-06, + "loss": 0.3651, + "step": 4810 + }, + { + "epoch": 3.336338418862691, + "grad_norm": 0.38128040683865877, + "learning_rate": 3.0099736984901806e-06, + "loss": 0.3916, + "step": 4811 + }, + { + "epoch": 3.337031900138696, + "grad_norm": 0.41167653379599123, + "learning_rate": 3.00775321665883e-06, + "loss": 0.3926, + "step": 4812 + }, + { + "epoch": 3.337725381414702, + "grad_norm": 0.38515492164328397, + "learning_rate": 3.0055332017956984e-06, + "loss": 0.4075, + "step": 4813 + }, + { + "epoch": 3.338418862690707, + "grad_norm": 0.3928173736721108, + "learning_rate": 3.0033136544211387e-06, + "loss": 0.387, + "step": 4814 + }, + { + "epoch": 3.339112343966713, + "grad_norm": 0.4052209320438356, + "learning_rate": 3.0010945750553975e-06, + "loss": 0.3979, + "step": 4815 + }, + { + "epoch": 3.3398058252427183, + "grad_norm": 0.38808232379758656, + "learning_rate": 2.99887596421861e-06, + "loss": 0.3647, + "step": 4816 + }, + { + "epoch": 3.340499306518724, + "grad_norm": 0.38593592589078973, + "learning_rate": 2.9966578224308053e-06, + "loss": 0.3838, + "step": 4817 + }, + { + "epoch": 3.3411927877947294, + "grad_norm": 0.38211435077861855, + "learning_rate": 2.9944401502118987e-06, + "loss": 0.4043, + "step": 4818 + }, + { + "epoch": 3.341886269070735, + "grad_norm": 0.492156552703435, + "learning_rate": 2.9922229480816956e-06, + "loss": 0.3818, + "step": 4819 + }, + { + "epoch": 3.3425797503467405, + "grad_norm": 0.4168359745761171, + "learning_rate": 2.9900062165598916e-06, + "loss": 0.3346, + "step": 4820 + }, + { + "epoch": 3.3432732316227463, + "grad_norm": 0.42075187556768207, + "learning_rate": 2.987789956166074e-06, + "loss": 0.4033, + "step": 4821 + }, + { + "epoch": 3.3439667128987516, + "grad_norm": 0.4143576158340521, + "learning_rate": 2.9855741674197182e-06, + "loss": 0.3969, + "step": 4822 + }, + { + "epoch": 3.3446601941747574, + "grad_norm": 0.4120220938954012, + "learning_rate": 2.983358850840187e-06, + "loss": 0.3981, + "step": 4823 + }, + { + "epoch": 3.3453536754507627, + "grad_norm": 0.40816273604396464, + "learning_rate": 2.9811440069467367e-06, + "loss": 0.4046, + "step": 4824 + }, + { + "epoch": 3.3460471567267684, + "grad_norm": 0.4000688578661085, + "learning_rate": 2.9789296362585084e-06, + "loss": 0.3502, + "step": 4825 + }, + { + "epoch": 3.3467406380027738, + "grad_norm": 0.41682558150661075, + "learning_rate": 2.9767157392945378e-06, + "loss": 0.398, + "step": 4826 + }, + { + "epoch": 3.3474341192787795, + "grad_norm": 0.3921429805180197, + "learning_rate": 2.9745023165737445e-06, + "loss": 0.3713, + "step": 4827 + }, + { + "epoch": 3.348127600554785, + "grad_norm": 0.36688239420464774, + "learning_rate": 2.9722893686149377e-06, + "loss": 0.3736, + "step": 4828 + }, + { + "epoch": 3.3488210818307906, + "grad_norm": 0.3887522990258237, + "learning_rate": 2.9700768959368196e-06, + "loss": 0.3373, + "step": 4829 + }, + { + "epoch": 3.349514563106796, + "grad_norm": 0.5145190725681971, + "learning_rate": 2.967864899057975e-06, + "loss": 0.4464, + "step": 4830 + }, + { + "epoch": 3.3502080443828017, + "grad_norm": 0.4339586706330397, + "learning_rate": 2.9656533784968804e-06, + "loss": 0.4072, + "step": 4831 + }, + { + "epoch": 3.350901525658807, + "grad_norm": 0.4060396537410531, + "learning_rate": 2.9634423347718998e-06, + "loss": 0.4047, + "step": 4832 + }, + { + "epoch": 3.351595006934813, + "grad_norm": 0.3928970993760514, + "learning_rate": 2.961231768401287e-06, + "loss": 0.3436, + "step": 4833 + }, + { + "epoch": 3.352288488210818, + "grad_norm": 0.4517601326555857, + "learning_rate": 2.9590216799031814e-06, + "loss": 0.4042, + "step": 4834 + }, + { + "epoch": 3.352981969486824, + "grad_norm": 0.44926535318849076, + "learning_rate": 2.9568120697956137e-06, + "loss": 0.4021, + "step": 4835 + }, + { + "epoch": 3.3536754507628292, + "grad_norm": 0.4066676413216299, + "learning_rate": 2.954602938596499e-06, + "loss": 0.4197, + "step": 4836 + }, + { + "epoch": 3.354368932038835, + "grad_norm": 0.610090112849432, + "learning_rate": 2.9523942868236414e-06, + "loss": 0.3733, + "step": 4837 + }, + { + "epoch": 3.3550624133148403, + "grad_norm": 0.4016903449405014, + "learning_rate": 2.9501861149947347e-06, + "loss": 0.3561, + "step": 4838 + }, + { + "epoch": 3.355755894590846, + "grad_norm": 0.3941258843057235, + "learning_rate": 2.9479784236273572e-06, + "loss": 0.3414, + "step": 4839 + }, + { + "epoch": 3.3564493758668514, + "grad_norm": 0.43253920163960613, + "learning_rate": 2.945771213238975e-06, + "loss": 0.3504, + "step": 4840 + }, + { + "epoch": 3.357142857142857, + "grad_norm": 0.35929961615809525, + "learning_rate": 2.9435644843469434e-06, + "loss": 0.3674, + "step": 4841 + }, + { + "epoch": 3.3578363384188625, + "grad_norm": 0.4007135177687099, + "learning_rate": 2.9413582374685036e-06, + "loss": 0.3476, + "step": 4842 + }, + { + "epoch": 3.3585298196948683, + "grad_norm": 0.4080473708867387, + "learning_rate": 2.939152473120781e-06, + "loss": 0.3378, + "step": 4843 + }, + { + "epoch": 3.3592233009708736, + "grad_norm": 0.3946858676555683, + "learning_rate": 2.936947191820796e-06, + "loss": 0.3891, + "step": 4844 + }, + { + "epoch": 3.3599167822468794, + "grad_norm": 0.40830885659396526, + "learning_rate": 2.934742394085447e-06, + "loss": 0.3665, + "step": 4845 + }, + { + "epoch": 3.3606102635228847, + "grad_norm": 0.3958600097382781, + "learning_rate": 2.932538080431524e-06, + "loss": 0.4472, + "step": 4846 + }, + { + "epoch": 3.3613037447988905, + "grad_norm": 0.40731749546536183, + "learning_rate": 2.9303342513757023e-06, + "loss": 0.3624, + "step": 4847 + }, + { + "epoch": 3.361997226074896, + "grad_norm": 0.39258631435835023, + "learning_rate": 2.928130907434541e-06, + "loss": 0.36, + "step": 4848 + }, + { + "epoch": 3.3626907073509016, + "grad_norm": 0.4436248182024021, + "learning_rate": 2.925928049124491e-06, + "loss": 0.3855, + "step": 4849 + }, + { + "epoch": 3.363384188626907, + "grad_norm": 0.4250106708461144, + "learning_rate": 2.923725676961886e-06, + "loss": 0.3486, + "step": 4850 + }, + { + "epoch": 3.3640776699029127, + "grad_norm": 0.8206220779206658, + "learning_rate": 2.9215237914629445e-06, + "loss": 0.3843, + "step": 4851 + }, + { + "epoch": 3.364771151178918, + "grad_norm": 0.34570622275469653, + "learning_rate": 2.919322393143772e-06, + "loss": 0.3341, + "step": 4852 + }, + { + "epoch": 3.3654646324549238, + "grad_norm": 0.3837678150132038, + "learning_rate": 2.9171214825203626e-06, + "loss": 0.3754, + "step": 4853 + }, + { + "epoch": 3.366158113730929, + "grad_norm": 0.39575493565391595, + "learning_rate": 2.914921060108592e-06, + "loss": 0.4418, + "step": 4854 + }, + { + "epoch": 3.366851595006935, + "grad_norm": 0.42549432184618546, + "learning_rate": 2.9127211264242244e-06, + "loss": 0.3677, + "step": 4855 + }, + { + "epoch": 3.36754507628294, + "grad_norm": 0.3822340361511247, + "learning_rate": 2.9105216819829094e-06, + "loss": 0.3625, + "step": 4856 + }, + { + "epoch": 3.368238557558946, + "grad_norm": 0.36753243020092696, + "learning_rate": 2.9083227273001784e-06, + "loss": 0.3836, + "step": 4857 + }, + { + "epoch": 3.3689320388349513, + "grad_norm": 0.504998469652746, + "learning_rate": 2.906124262891451e-06, + "loss": 0.3989, + "step": 4858 + }, + { + "epoch": 3.369625520110957, + "grad_norm": 0.39073449430611673, + "learning_rate": 2.9039262892720338e-06, + "loss": 0.3717, + "step": 4859 + }, + { + "epoch": 3.3703190013869624, + "grad_norm": 0.3863673296401061, + "learning_rate": 2.9017288069571114e-06, + "loss": 0.3783, + "step": 4860 + }, + { + "epoch": 3.371012482662968, + "grad_norm": 0.3908072330064723, + "learning_rate": 2.8995318164617614e-06, + "loss": 0.3549, + "step": 4861 + }, + { + "epoch": 3.3717059639389735, + "grad_norm": 0.45745098507551407, + "learning_rate": 2.89733531830094e-06, + "loss": 0.3748, + "step": 4862 + }, + { + "epoch": 3.3723994452149793, + "grad_norm": 0.3936236154645762, + "learning_rate": 2.8951393129894928e-06, + "loss": 0.3571, + "step": 4863 + }, + { + "epoch": 3.3730929264909846, + "grad_norm": 0.4247916075005123, + "learning_rate": 2.8929438010421486e-06, + "loss": 0.4084, + "step": 4864 + }, + { + "epoch": 3.3737864077669903, + "grad_norm": 0.42947294425671784, + "learning_rate": 2.8907487829735147e-06, + "loss": 0.3714, + "step": 4865 + }, + { + "epoch": 3.3744798890429957, + "grad_norm": 0.6270453252780668, + "learning_rate": 2.888554259298092e-06, + "loss": 0.3945, + "step": 4866 + }, + { + "epoch": 3.3751733703190014, + "grad_norm": 0.46125027737791646, + "learning_rate": 2.886360230530258e-06, + "loss": 0.403, + "step": 4867 + }, + { + "epoch": 3.3758668515950068, + "grad_norm": 0.416072507615972, + "learning_rate": 2.8841666971842776e-06, + "loss": 0.3726, + "step": 4868 + }, + { + "epoch": 3.3765603328710125, + "grad_norm": 0.4009550156039696, + "learning_rate": 2.881973659774302e-06, + "loss": 0.4169, + "step": 4869 + }, + { + "epoch": 3.377253814147018, + "grad_norm": 0.35906874390525434, + "learning_rate": 2.8797811188143572e-06, + "loss": 0.3707, + "step": 4870 + }, + { + "epoch": 3.3779472954230236, + "grad_norm": 0.3779197767370708, + "learning_rate": 2.8775890748183666e-06, + "loss": 0.3243, + "step": 4871 + }, + { + "epoch": 3.378640776699029, + "grad_norm": 0.39668593449340256, + "learning_rate": 2.8753975283001232e-06, + "loss": 0.3681, + "step": 4872 + }, + { + "epoch": 3.3793342579750347, + "grad_norm": 0.42130463516499855, + "learning_rate": 2.873206479773313e-06, + "loss": 0.3405, + "step": 4873 + }, + { + "epoch": 3.38002773925104, + "grad_norm": 0.38727094281194896, + "learning_rate": 2.8710159297515027e-06, + "loss": 0.4041, + "step": 4874 + }, + { + "epoch": 3.380721220527046, + "grad_norm": 0.38543446372078244, + "learning_rate": 2.8688258787481376e-06, + "loss": 0.4067, + "step": 4875 + }, + { + "epoch": 3.381414701803051, + "grad_norm": 0.3813864971120607, + "learning_rate": 2.866636327276552e-06, + "loss": 0.3725, + "step": 4876 + }, + { + "epoch": 3.382108183079057, + "grad_norm": 0.3870088894913858, + "learning_rate": 2.864447275849962e-06, + "loss": 0.347, + "step": 4877 + }, + { + "epoch": 3.3828016643550622, + "grad_norm": 0.35426576167665785, + "learning_rate": 2.8622587249814625e-06, + "loss": 0.3716, + "step": 4878 + }, + { + "epoch": 3.383495145631068, + "grad_norm": 0.39258220662106075, + "learning_rate": 2.860070675184036e-06, + "loss": 0.369, + "step": 4879 + }, + { + "epoch": 3.3841886269070733, + "grad_norm": 0.38958571880739534, + "learning_rate": 2.8578831269705454e-06, + "loss": 0.4157, + "step": 4880 + }, + { + "epoch": 3.384882108183079, + "grad_norm": 0.40299059748768185, + "learning_rate": 2.855696080853735e-06, + "loss": 0.3986, + "step": 4881 + }, + { + "epoch": 3.3855755894590844, + "grad_norm": 0.4756806532688594, + "learning_rate": 2.853509537346236e-06, + "loss": 0.3539, + "step": 4882 + }, + { + "epoch": 3.38626907073509, + "grad_norm": 0.40915831012982906, + "learning_rate": 2.8513234969605534e-06, + "loss": 0.3881, + "step": 4883 + }, + { + "epoch": 3.3869625520110955, + "grad_norm": 0.42051132510023564, + "learning_rate": 2.8491379602090816e-06, + "loss": 0.3443, + "step": 4884 + }, + { + "epoch": 3.3876560332871013, + "grad_norm": 0.41574227309923295, + "learning_rate": 2.8469529276040976e-06, + "loss": 0.3659, + "step": 4885 + }, + { + "epoch": 3.3883495145631066, + "grad_norm": 0.3801177312434935, + "learning_rate": 2.8447683996577513e-06, + "loss": 0.397, + "step": 4886 + }, + { + "epoch": 3.3890429958391124, + "grad_norm": 0.3922759108182902, + "learning_rate": 2.8425843768820838e-06, + "loss": 0.3889, + "step": 4887 + }, + { + "epoch": 3.3897364771151177, + "grad_norm": 0.3738034887792933, + "learning_rate": 2.840400859789013e-06, + "loss": 0.3271, + "step": 4888 + }, + { + "epoch": 3.3904299583911235, + "grad_norm": 0.4141784960468706, + "learning_rate": 2.838217848890341e-06, + "loss": 0.381, + "step": 4889 + }, + { + "epoch": 3.391123439667129, + "grad_norm": 0.38936755920522714, + "learning_rate": 2.8360353446977505e-06, + "loss": 0.3926, + "step": 4890 + }, + { + "epoch": 3.3918169209431346, + "grad_norm": 0.4371766590483668, + "learning_rate": 2.8338533477228007e-06, + "loss": 0.3683, + "step": 4891 + }, + { + "epoch": 3.39251040221914, + "grad_norm": 0.40790824562218736, + "learning_rate": 2.8316718584769385e-06, + "loss": 0.3726, + "step": 4892 + }, + { + "epoch": 3.3932038834951457, + "grad_norm": 0.38681992444589886, + "learning_rate": 2.829490877471491e-06, + "loss": 0.3519, + "step": 4893 + }, + { + "epoch": 3.393897364771151, + "grad_norm": 0.37556829310744894, + "learning_rate": 2.8273104052176603e-06, + "loss": 0.3824, + "step": 4894 + }, + { + "epoch": 3.3945908460471568, + "grad_norm": 0.4443909455244169, + "learning_rate": 2.8251304422265347e-06, + "loss": 0.3719, + "step": 4895 + }, + { + "epoch": 3.395284327323162, + "grad_norm": 0.4118663188699836, + "learning_rate": 2.8229509890090843e-06, + "loss": 0.3945, + "step": 4896 + }, + { + "epoch": 3.395977808599168, + "grad_norm": 0.38264484392423737, + "learning_rate": 2.8207720460761523e-06, + "loss": 0.3732, + "step": 4897 + }, + { + "epoch": 3.396671289875173, + "grad_norm": 0.3910796509452467, + "learning_rate": 2.8185936139384727e-06, + "loss": 0.3447, + "step": 4898 + }, + { + "epoch": 3.397364771151179, + "grad_norm": 0.40475057627631883, + "learning_rate": 2.81641569310665e-06, + "loss": 0.3923, + "step": 4899 + }, + { + "epoch": 3.3980582524271843, + "grad_norm": 0.46432757412272163, + "learning_rate": 2.8142382840911747e-06, + "loss": 0.3981, + "step": 4900 + }, + { + "epoch": 3.39875173370319, + "grad_norm": 0.44219148564980687, + "learning_rate": 2.8120613874024173e-06, + "loss": 0.4332, + "step": 4901 + }, + { + "epoch": 3.3994452149791954, + "grad_norm": 0.38875900013315995, + "learning_rate": 2.809885003550623e-06, + "loss": 0.3809, + "step": 4902 + }, + { + "epoch": 3.400138696255201, + "grad_norm": 0.3999033233778416, + "learning_rate": 2.8077091330459225e-06, + "loss": 0.3422, + "step": 4903 + }, + { + "epoch": 3.4008321775312065, + "grad_norm": 0.7729474720429104, + "learning_rate": 2.805533776398326e-06, + "loss": 0.3685, + "step": 4904 + }, + { + "epoch": 3.4015256588072122, + "grad_norm": 0.39594443974714943, + "learning_rate": 2.803358934117717e-06, + "loss": 0.3821, + "step": 4905 + }, + { + "epoch": 3.4022191400832176, + "grad_norm": 0.4427389632249658, + "learning_rate": 2.8011846067138648e-06, + "loss": 0.3678, + "step": 4906 + }, + { + "epoch": 3.4029126213592233, + "grad_norm": 0.38383049263298413, + "learning_rate": 2.7990107946964163e-06, + "loss": 0.3722, + "step": 4907 + }, + { + "epoch": 3.4036061026352287, + "grad_norm": 0.3771531191666602, + "learning_rate": 2.7968374985748977e-06, + "loss": 0.3965, + "step": 4908 + }, + { + "epoch": 3.4042995839112344, + "grad_norm": 0.45903971744526006, + "learning_rate": 2.794664718858715e-06, + "loss": 0.3506, + "step": 4909 + }, + { + "epoch": 3.4049930651872398, + "grad_norm": 0.41421234277571894, + "learning_rate": 2.792492456057148e-06, + "loss": 0.3829, + "step": 4910 + }, + { + "epoch": 3.4056865464632455, + "grad_norm": 0.402475409613642, + "learning_rate": 2.7903207106793646e-06, + "loss": 0.4218, + "step": 4911 + }, + { + "epoch": 3.406380027739251, + "grad_norm": 0.39327737538667384, + "learning_rate": 2.7881494832344008e-06, + "loss": 0.3793, + "step": 4912 + }, + { + "epoch": 3.4070735090152566, + "grad_norm": 0.49336222269509616, + "learning_rate": 2.7859787742311794e-06, + "loss": 0.42, + "step": 4913 + }, + { + "epoch": 3.407766990291262, + "grad_norm": 0.3943370551289641, + "learning_rate": 2.7838085841785005e-06, + "loss": 0.3867, + "step": 4914 + }, + { + "epoch": 3.4084604715672677, + "grad_norm": 0.42733581194457265, + "learning_rate": 2.7816389135850353e-06, + "loss": 0.4147, + "step": 4915 + }, + { + "epoch": 3.409153952843273, + "grad_norm": 0.38723319973204884, + "learning_rate": 2.7794697629593457e-06, + "loss": 0.3829, + "step": 4916 + }, + { + "epoch": 3.409847434119279, + "grad_norm": 0.44312960703892645, + "learning_rate": 2.777301132809861e-06, + "loss": 0.4378, + "step": 4917 + }, + { + "epoch": 3.410540915395284, + "grad_norm": 0.4333836906210958, + "learning_rate": 2.775133023644893e-06, + "loss": 0.3976, + "step": 4918 + }, + { + "epoch": 3.41123439667129, + "grad_norm": 0.3848660625105505, + "learning_rate": 2.7729654359726327e-06, + "loss": 0.353, + "step": 4919 + }, + { + "epoch": 3.4119278779472952, + "grad_norm": 0.3975276218469039, + "learning_rate": 2.770798370301143e-06, + "loss": 0.4022, + "step": 4920 + }, + { + "epoch": 3.412621359223301, + "grad_norm": 0.3826030067414858, + "learning_rate": 2.7686318271383717e-06, + "loss": 0.3513, + "step": 4921 + }, + { + "epoch": 3.4133148404993063, + "grad_norm": 0.5153097547808159, + "learning_rate": 2.7664658069921415e-06, + "loss": 0.4003, + "step": 4922 + }, + { + "epoch": 3.414008321775312, + "grad_norm": 0.388803719858887, + "learning_rate": 2.764300310370147e-06, + "loss": 0.3933, + "step": 4923 + }, + { + "epoch": 3.4147018030513174, + "grad_norm": 0.37902493217826716, + "learning_rate": 2.762135337779969e-06, + "loss": 0.4015, + "step": 4924 + }, + { + "epoch": 3.415395284327323, + "grad_norm": 0.449218891198431, + "learning_rate": 2.75997088972906e-06, + "loss": 0.3987, + "step": 4925 + }, + { + "epoch": 3.4160887656033285, + "grad_norm": 0.40938851019963973, + "learning_rate": 2.757806966724752e-06, + "loss": 0.3932, + "step": 4926 + }, + { + "epoch": 3.4167822468793343, + "grad_norm": 0.4075548869080461, + "learning_rate": 2.755643569274254e-06, + "loss": 0.3754, + "step": 4927 + }, + { + "epoch": 3.4174757281553396, + "grad_norm": 0.43239540728189507, + "learning_rate": 2.753480697884647e-06, + "loss": 0.3835, + "step": 4928 + }, + { + "epoch": 3.4181692094313454, + "grad_norm": 0.465700034039619, + "learning_rate": 2.751318353062894e-06, + "loss": 0.393, + "step": 4929 + }, + { + "epoch": 3.4188626907073507, + "grad_norm": 0.39435980690696476, + "learning_rate": 2.7491565353158356e-06, + "loss": 0.4192, + "step": 4930 + }, + { + "epoch": 3.4195561719833565, + "grad_norm": 0.39235831769813967, + "learning_rate": 2.7469952451501825e-06, + "loss": 0.3703, + "step": 4931 + }, + { + "epoch": 3.420249653259362, + "grad_norm": 0.4332464961711542, + "learning_rate": 2.744834483072526e-06, + "loss": 0.3784, + "step": 4932 + }, + { + "epoch": 3.4209431345353676, + "grad_norm": 0.4361363296596932, + "learning_rate": 2.7426742495893343e-06, + "loss": 0.4181, + "step": 4933 + }, + { + "epoch": 3.421636615811373, + "grad_norm": 0.4211794105331103, + "learning_rate": 2.7405145452069505e-06, + "loss": 0.3956, + "step": 4934 + }, + { + "epoch": 3.4223300970873787, + "grad_norm": 0.3920133142444421, + "learning_rate": 2.7383553704315946e-06, + "loss": 0.3682, + "step": 4935 + }, + { + "epoch": 3.423023578363384, + "grad_norm": 0.41823077310912277, + "learning_rate": 2.736196725769359e-06, + "loss": 0.3589, + "step": 4936 + }, + { + "epoch": 3.4237170596393898, + "grad_norm": 0.507199572755019, + "learning_rate": 2.734038611726215e-06, + "loss": 0.3763, + "step": 4937 + }, + { + "epoch": 3.424410540915395, + "grad_norm": 0.36561853832034596, + "learning_rate": 2.731881028808012e-06, + "loss": 0.333, + "step": 4938 + }, + { + "epoch": 3.425104022191401, + "grad_norm": 0.37067240687619535, + "learning_rate": 2.7297239775204674e-06, + "loss": 0.3569, + "step": 4939 + }, + { + "epoch": 3.425797503467406, + "grad_norm": 0.3661613510657694, + "learning_rate": 2.7275674583691804e-06, + "loss": 0.3578, + "step": 4940 + }, + { + "epoch": 3.426490984743412, + "grad_norm": 0.3799890045841063, + "learning_rate": 2.7254114718596253e-06, + "loss": 0.3714, + "step": 4941 + }, + { + "epoch": 3.4271844660194173, + "grad_norm": 0.4125765941311352, + "learning_rate": 2.7232560184971437e-06, + "loss": 0.393, + "step": 4942 + }, + { + "epoch": 3.427877947295423, + "grad_norm": 0.39792676362304097, + "learning_rate": 2.721101098786967e-06, + "loss": 0.3662, + "step": 4943 + }, + { + "epoch": 3.4285714285714284, + "grad_norm": 0.43528143961658444, + "learning_rate": 2.718946713234185e-06, + "loss": 0.3692, + "step": 4944 + }, + { + "epoch": 3.429264909847434, + "grad_norm": 0.37779663047146134, + "learning_rate": 2.7167928623437744e-06, + "loss": 0.334, + "step": 4945 + }, + { + "epoch": 3.4299583911234395, + "grad_norm": 0.3623910565481717, + "learning_rate": 2.714639546620582e-06, + "loss": 0.3706, + "step": 4946 + }, + { + "epoch": 3.4306518723994452, + "grad_norm": 0.4217422480863894, + "learning_rate": 2.7124867665693276e-06, + "loss": 0.388, + "step": 4947 + }, + { + "epoch": 3.4313453536754506, + "grad_norm": 0.38209248965279885, + "learning_rate": 2.71033452269461e-06, + "loss": 0.31, + "step": 4948 + }, + { + "epoch": 3.4320388349514563, + "grad_norm": 0.35687687047638755, + "learning_rate": 2.7081828155008953e-06, + "loss": 0.3556, + "step": 4949 + }, + { + "epoch": 3.4327323162274617, + "grad_norm": 0.38785585870490286, + "learning_rate": 2.7060316454925305e-06, + "loss": 0.3894, + "step": 4950 + }, + { + "epoch": 3.4334257975034674, + "grad_norm": 0.404601808598807, + "learning_rate": 2.7038810131737346e-06, + "loss": 0.3734, + "step": 4951 + }, + { + "epoch": 3.4341192787794728, + "grad_norm": 0.4191797723449136, + "learning_rate": 2.7017309190486e-06, + "loss": 0.4103, + "step": 4952 + }, + { + "epoch": 3.4348127600554785, + "grad_norm": 0.39661398154454475, + "learning_rate": 2.699581363621093e-06, + "loss": 0.3202, + "step": 4953 + }, + { + "epoch": 3.435506241331484, + "grad_norm": 0.4036795626786731, + "learning_rate": 2.697432347395056e-06, + "loss": 0.3618, + "step": 4954 + }, + { + "epoch": 3.4361997226074896, + "grad_norm": 0.3989886522608835, + "learning_rate": 2.695283870874199e-06, + "loss": 0.3801, + "step": 4955 + }, + { + "epoch": 3.436893203883495, + "grad_norm": 0.38679606268716427, + "learning_rate": 2.693135934562113e-06, + "loss": 0.3684, + "step": 4956 + }, + { + "epoch": 3.4375866851595007, + "grad_norm": 0.40409128568604624, + "learning_rate": 2.6909885389622547e-06, + "loss": 0.4059, + "step": 4957 + }, + { + "epoch": 3.438280166435506, + "grad_norm": 0.3771051473781267, + "learning_rate": 2.68884168457796e-06, + "loss": 0.378, + "step": 4958 + }, + { + "epoch": 3.438973647711512, + "grad_norm": 0.3768436049736947, + "learning_rate": 2.6866953719124365e-06, + "loss": 0.3847, + "step": 4959 + }, + { + "epoch": 3.4396671289875176, + "grad_norm": 0.4149233474542711, + "learning_rate": 2.684549601468764e-06, + "loss": 0.3822, + "step": 4960 + }, + { + "epoch": 3.440360610263523, + "grad_norm": 0.4396941506657551, + "learning_rate": 2.6824043737498978e-06, + "loss": 0.4048, + "step": 4961 + }, + { + "epoch": 3.4410540915395282, + "grad_norm": 0.39699085268813994, + "learning_rate": 2.6802596892586595e-06, + "loss": 0.3679, + "step": 4962 + }, + { + "epoch": 3.441747572815534, + "grad_norm": 0.38707204105879717, + "learning_rate": 2.6781155484977495e-06, + "loss": 0.3818, + "step": 4963 + }, + { + "epoch": 3.4424410540915398, + "grad_norm": 0.40384547582505415, + "learning_rate": 2.6759719519697412e-06, + "loss": 0.3858, + "step": 4964 + }, + { + "epoch": 3.443134535367545, + "grad_norm": 0.36378350408684135, + "learning_rate": 2.673828900177074e-06, + "loss": 0.3171, + "step": 4965 + }, + { + "epoch": 3.4438280166435504, + "grad_norm": 0.4095288700423232, + "learning_rate": 2.671686393622066e-06, + "loss": 0.3504, + "step": 4966 + }, + { + "epoch": 3.444521497919556, + "grad_norm": 0.40593831978099953, + "learning_rate": 2.6695444328069063e-06, + "loss": 0.4187, + "step": 4967 + }, + { + "epoch": 3.445214979195562, + "grad_norm": 0.3777510446737168, + "learning_rate": 2.6674030182336496e-06, + "loss": 0.3572, + "step": 4968 + }, + { + "epoch": 3.4459084604715673, + "grad_norm": 0.38471942821892796, + "learning_rate": 2.6652621504042366e-06, + "loss": 0.3721, + "step": 4969 + }, + { + "epoch": 3.4466019417475726, + "grad_norm": 0.5303866671587186, + "learning_rate": 2.6631218298204643e-06, + "loss": 0.3707, + "step": 4970 + }, + { + "epoch": 3.4472954230235784, + "grad_norm": 0.38591395865593525, + "learning_rate": 2.6609820569840106e-06, + "loss": 0.375, + "step": 4971 + }, + { + "epoch": 3.447988904299584, + "grad_norm": 0.43278824202581095, + "learning_rate": 2.6588428323964243e-06, + "loss": 0.3654, + "step": 4972 + }, + { + "epoch": 3.4486823855755895, + "grad_norm": 0.48739101326818984, + "learning_rate": 2.656704156559121e-06, + "loss": 0.3612, + "step": 4973 + }, + { + "epoch": 3.449375866851595, + "grad_norm": 0.416971104948852, + "learning_rate": 2.6545660299733923e-06, + "loss": 0.3834, + "step": 4974 + }, + { + "epoch": 3.4500693481276006, + "grad_norm": 0.41300738954943433, + "learning_rate": 2.652428453140402e-06, + "loss": 0.3728, + "step": 4975 + }, + { + "epoch": 3.4507628294036063, + "grad_norm": 0.36040023295863355, + "learning_rate": 2.6502914265611783e-06, + "loss": 0.3127, + "step": 4976 + }, + { + "epoch": 3.4514563106796117, + "grad_norm": 0.36416957185809895, + "learning_rate": 2.6481549507366266e-06, + "loss": 0.3984, + "step": 4977 + }, + { + "epoch": 3.452149791955617, + "grad_norm": 0.41709294981582845, + "learning_rate": 2.6460190261675223e-06, + "loss": 0.3884, + "step": 4978 + }, + { + "epoch": 3.4528432732316228, + "grad_norm": 1.042961544375003, + "learning_rate": 2.6438836533545092e-06, + "loss": 0.3926, + "step": 4979 + }, + { + "epoch": 3.4535367545076285, + "grad_norm": 0.480741388193357, + "learning_rate": 2.641748832798107e-06, + "loss": 0.4261, + "step": 4980 + }, + { + "epoch": 3.454230235783634, + "grad_norm": 0.42416611325259135, + "learning_rate": 2.639614564998696e-06, + "loss": 0.382, + "step": 4981 + }, + { + "epoch": 3.454923717059639, + "grad_norm": 0.3548475153760887, + "learning_rate": 2.6374808504565363e-06, + "loss": 0.3577, + "step": 4982 + }, + { + "epoch": 3.455617198335645, + "grad_norm": 0.445091616710834, + "learning_rate": 2.6353476896717574e-06, + "loss": 0.3932, + "step": 4983 + }, + { + "epoch": 3.4563106796116507, + "grad_norm": 0.39396270646454185, + "learning_rate": 2.6332150831443524e-06, + "loss": 0.3721, + "step": 4984 + }, + { + "epoch": 3.457004160887656, + "grad_norm": 0.42989417337507274, + "learning_rate": 2.631083031374191e-06, + "loss": 0.3795, + "step": 4985 + }, + { + "epoch": 3.4576976421636614, + "grad_norm": 0.4607573109464507, + "learning_rate": 2.62895153486101e-06, + "loss": 0.3979, + "step": 4986 + }, + { + "epoch": 3.458391123439667, + "grad_norm": 0.37516883158470005, + "learning_rate": 2.626820594104418e-06, + "loss": 0.3632, + "step": 4987 + }, + { + "epoch": 3.459084604715673, + "grad_norm": 0.40405539795301204, + "learning_rate": 2.624690209603893e-06, + "loss": 0.3531, + "step": 4988 + }, + { + "epoch": 3.4597780859916782, + "grad_norm": 0.37632193303013134, + "learning_rate": 2.622560381858778e-06, + "loss": 0.375, + "step": 4989 + }, + { + "epoch": 3.4604715672676836, + "grad_norm": 0.3572816828055843, + "learning_rate": 2.620431111368291e-06, + "loss": 0.3211, + "step": 4990 + }, + { + "epoch": 3.4611650485436893, + "grad_norm": 0.42058442276817903, + "learning_rate": 2.6183023986315202e-06, + "loss": 0.3904, + "step": 4991 + }, + { + "epoch": 3.461858529819695, + "grad_norm": 0.3799187792338103, + "learning_rate": 2.6161742441474166e-06, + "loss": 0.3723, + "step": 4992 + }, + { + "epoch": 3.4625520110957004, + "grad_norm": 0.43632636713018264, + "learning_rate": 2.6140466484148074e-06, + "loss": 0.3779, + "step": 4993 + }, + { + "epoch": 3.4632454923717058, + "grad_norm": 0.41831324602323344, + "learning_rate": 2.6119196119323813e-06, + "loss": 0.3568, + "step": 4994 + }, + { + "epoch": 3.4639389736477115, + "grad_norm": 0.43314689696278474, + "learning_rate": 2.6097931351987014e-06, + "loss": 0.4197, + "step": 4995 + }, + { + "epoch": 3.4646324549237173, + "grad_norm": 0.3778873958768719, + "learning_rate": 2.6076672187122043e-06, + "loss": 0.3615, + "step": 4996 + }, + { + "epoch": 3.4653259361997226, + "grad_norm": 0.47570246175112035, + "learning_rate": 2.6055418629711825e-06, + "loss": 0.3961, + "step": 4997 + }, + { + "epoch": 3.466019417475728, + "grad_norm": 0.4443709246799145, + "learning_rate": 2.6034170684738065e-06, + "loss": 0.3573, + "step": 4998 + }, + { + "epoch": 3.4667128987517337, + "grad_norm": 0.4011150158884296, + "learning_rate": 2.6012928357181145e-06, + "loss": 0.3885, + "step": 4999 + }, + { + "epoch": 3.4674063800277395, + "grad_norm": 0.3628191538033867, + "learning_rate": 2.599169165202008e-06, + "loss": 0.3405, + "step": 5000 + }, + { + "epoch": 3.468099861303745, + "grad_norm": 0.37404681848891647, + "learning_rate": 2.5970460574232636e-06, + "loss": 0.3653, + "step": 5001 + }, + { + "epoch": 3.46879334257975, + "grad_norm": 0.4508620958026842, + "learning_rate": 2.594923512879518e-06, + "loss": 0.4145, + "step": 5002 + }, + { + "epoch": 3.469486823855756, + "grad_norm": 0.3770502950046798, + "learning_rate": 2.592801532068283e-06, + "loss": 0.349, + "step": 5003 + }, + { + "epoch": 3.4701803051317617, + "grad_norm": 0.40674648087365345, + "learning_rate": 2.5906801154869355e-06, + "loss": 0.3761, + "step": 5004 + }, + { + "epoch": 3.470873786407767, + "grad_norm": 0.4437004351728985, + "learning_rate": 2.588559263632719e-06, + "loss": 0.3744, + "step": 5005 + }, + { + "epoch": 3.4715672676837723, + "grad_norm": 0.40667005781382637, + "learning_rate": 2.586438977002749e-06, + "loss": 0.3388, + "step": 5006 + }, + { + "epoch": 3.472260748959778, + "grad_norm": 0.3999108972621037, + "learning_rate": 2.584319256094001e-06, + "loss": 0.3851, + "step": 5007 + }, + { + "epoch": 3.472954230235784, + "grad_norm": 0.4262331894520866, + "learning_rate": 2.582200101403324e-06, + "loss": 0.3799, + "step": 5008 + }, + { + "epoch": 3.473647711511789, + "grad_norm": 0.3845148925025503, + "learning_rate": 2.5800815134274347e-06, + "loss": 0.392, + "step": 5009 + }, + { + "epoch": 3.4743411927877945, + "grad_norm": 0.38613127077001597, + "learning_rate": 2.5779634926629103e-06, + "loss": 0.3953, + "step": 5010 + }, + { + "epoch": 3.4750346740638003, + "grad_norm": 0.3714306681631704, + "learning_rate": 2.575846039606203e-06, + "loss": 0.3167, + "step": 5011 + }, + { + "epoch": 3.475728155339806, + "grad_norm": 0.3610781947722351, + "learning_rate": 2.573729154753629e-06, + "loss": 0.3823, + "step": 5012 + }, + { + "epoch": 3.4764216366158114, + "grad_norm": 0.6946274530018958, + "learning_rate": 2.571612838601365e-06, + "loss": 0.3602, + "step": 5013 + }, + { + "epoch": 3.4771151178918167, + "grad_norm": 0.46945705057369436, + "learning_rate": 2.5694970916454686e-06, + "loss": 0.3707, + "step": 5014 + }, + { + "epoch": 3.4778085991678225, + "grad_norm": 0.4229556483807219, + "learning_rate": 2.56738191438185e-06, + "loss": 0.3836, + "step": 5015 + }, + { + "epoch": 3.4785020804438282, + "grad_norm": 0.41856241165707003, + "learning_rate": 2.565267307306292e-06, + "loss": 0.33, + "step": 5016 + }, + { + "epoch": 3.4791955617198336, + "grad_norm": 0.3624775573869822, + "learning_rate": 2.563153270914446e-06, + "loss": 0.389, + "step": 5017 + }, + { + "epoch": 3.479889042995839, + "grad_norm": 0.38960483916856403, + "learning_rate": 2.5610398057018235e-06, + "loss": 0.3649, + "step": 5018 + }, + { + "epoch": 3.4805825242718447, + "grad_norm": 0.4049643602143973, + "learning_rate": 2.558926912163807e-06, + "loss": 0.388, + "step": 5019 + }, + { + "epoch": 3.4812760055478504, + "grad_norm": 0.3903420159153272, + "learning_rate": 2.5568145907956443e-06, + "loss": 0.3794, + "step": 5020 + }, + { + "epoch": 3.4819694868238558, + "grad_norm": 0.4064956894266744, + "learning_rate": 2.5547028420924454e-06, + "loss": 0.3991, + "step": 5021 + }, + { + "epoch": 3.482662968099861, + "grad_norm": 0.4072342619633535, + "learning_rate": 2.5525916665491907e-06, + "loss": 0.3513, + "step": 5022 + }, + { + "epoch": 3.483356449375867, + "grad_norm": 0.369253497607641, + "learning_rate": 2.550481064660724e-06, + "loss": 0.3392, + "step": 5023 + }, + { + "epoch": 3.4840499306518726, + "grad_norm": 0.634110116014436, + "learning_rate": 2.548371036921756e-06, + "loss": 0.3435, + "step": 5024 + }, + { + "epoch": 3.484743411927878, + "grad_norm": 0.40864749564981667, + "learning_rate": 2.5462615838268636e-06, + "loss": 0.4043, + "step": 5025 + }, + { + "epoch": 3.4854368932038833, + "grad_norm": 0.4161750802751289, + "learning_rate": 2.544152705870483e-06, + "loss": 0.3953, + "step": 5026 + }, + { + "epoch": 3.486130374479889, + "grad_norm": 0.393875121572175, + "learning_rate": 2.5420444035469218e-06, + "loss": 0.4, + "step": 5027 + }, + { + "epoch": 3.486823855755895, + "grad_norm": 0.36920507839132877, + "learning_rate": 2.539936677350353e-06, + "loss": 0.3184, + "step": 5028 + }, + { + "epoch": 3.4875173370319, + "grad_norm": 0.3839595302637746, + "learning_rate": 2.5378295277748087e-06, + "loss": 0.3655, + "step": 5029 + }, + { + "epoch": 3.4882108183079055, + "grad_norm": 0.3731071529634874, + "learning_rate": 2.5357229553141904e-06, + "loss": 0.3978, + "step": 5030 + }, + { + "epoch": 3.4889042995839112, + "grad_norm": 0.40674519331511216, + "learning_rate": 2.533616960462265e-06, + "loss": 0.4112, + "step": 5031 + }, + { + "epoch": 3.489597780859917, + "grad_norm": 0.38259029857683174, + "learning_rate": 2.531511543712662e-06, + "loss": 0.3743, + "step": 5032 + }, + { + "epoch": 3.4902912621359223, + "grad_norm": 0.4530269312224613, + "learning_rate": 2.5294067055588765e-06, + "loss": 0.3712, + "step": 5033 + }, + { + "epoch": 3.4909847434119277, + "grad_norm": 0.417386544534514, + "learning_rate": 2.5273024464942654e-06, + "loss": 0.3516, + "step": 5034 + }, + { + "epoch": 3.4916782246879334, + "grad_norm": 0.4640925026518083, + "learning_rate": 2.5251987670120527e-06, + "loss": 0.3607, + "step": 5035 + }, + { + "epoch": 3.492371705963939, + "grad_norm": 0.4054169974866148, + "learning_rate": 2.523095667605327e-06, + "loss": 0.3772, + "step": 5036 + }, + { + "epoch": 3.4930651872399445, + "grad_norm": 0.42471608109592923, + "learning_rate": 2.5209931487670364e-06, + "loss": 0.4067, + "step": 5037 + }, + { + "epoch": 3.49375866851595, + "grad_norm": 0.44490904198146786, + "learning_rate": 2.51889121099e-06, + "loss": 0.3734, + "step": 5038 + }, + { + "epoch": 3.4944521497919556, + "grad_norm": 0.42878308948564786, + "learning_rate": 2.516789854766893e-06, + "loss": 0.368, + "step": 5039 + }, + { + "epoch": 3.4951456310679614, + "grad_norm": 0.4172092134103728, + "learning_rate": 2.5146890805902575e-06, + "loss": 0.3575, + "step": 5040 + }, + { + "epoch": 3.4958391123439667, + "grad_norm": 0.4330252916886809, + "learning_rate": 2.5125888889525057e-06, + "loss": 0.3891, + "step": 5041 + }, + { + "epoch": 3.496532593619972, + "grad_norm": 0.44966801182943367, + "learning_rate": 2.5104892803459024e-06, + "loss": 0.3717, + "step": 5042 + }, + { + "epoch": 3.497226074895978, + "grad_norm": 0.45401406890894613, + "learning_rate": 2.508390255262583e-06, + "loss": 0.3679, + "step": 5043 + }, + { + "epoch": 3.4979195561719836, + "grad_norm": 0.4090479354873949, + "learning_rate": 2.5062918141945412e-06, + "loss": 0.3682, + "step": 5044 + }, + { + "epoch": 3.498613037447989, + "grad_norm": 0.38637435812051035, + "learning_rate": 2.5041939576336383e-06, + "loss": 0.367, + "step": 5045 + }, + { + "epoch": 3.4993065187239942, + "grad_norm": 0.4553382546991837, + "learning_rate": 2.5020966860715978e-06, + "loss": 0.3932, + "step": 5046 + }, + { + "epoch": 3.5, + "grad_norm": 0.4139407927285363, + "learning_rate": 2.5000000000000015e-06, + "loss": 0.3892, + "step": 5047 + }, + { + "epoch": 3.5006934812760058, + "grad_norm": 0.38301490117254405, + "learning_rate": 2.497903899910299e-06, + "loss": 0.3556, + "step": 5048 + }, + { + "epoch": 3.501386962552011, + "grad_norm": 0.3917710927109867, + "learning_rate": 2.4958083862938015e-06, + "loss": 0.4084, + "step": 5049 + }, + { + "epoch": 3.5020804438280164, + "grad_norm": 0.38857832797674746, + "learning_rate": 2.4937134596416823e-06, + "loss": 0.3733, + "step": 5050 + }, + { + "epoch": 3.502773925104022, + "grad_norm": 0.3881380743929215, + "learning_rate": 2.4916191204449785e-06, + "loss": 0.3907, + "step": 5051 + }, + { + "epoch": 3.503467406380028, + "grad_norm": 0.4486637336228026, + "learning_rate": 2.4895253691945847e-06, + "loss": 0.3558, + "step": 5052 + }, + { + "epoch": 3.5041608876560333, + "grad_norm": 0.4097794795215577, + "learning_rate": 2.487432206381262e-06, + "loss": 0.3652, + "step": 5053 + }, + { + "epoch": 3.5048543689320386, + "grad_norm": 1.3921924237059429, + "learning_rate": 2.4853396324956358e-06, + "loss": 0.43, + "step": 5054 + }, + { + "epoch": 3.5055478502080444, + "grad_norm": 0.41838093334476806, + "learning_rate": 2.4832476480281857e-06, + "loss": 0.3524, + "step": 5055 + }, + { + "epoch": 3.50624133148405, + "grad_norm": 0.41369193499130186, + "learning_rate": 2.4811562534692597e-06, + "loss": 0.3896, + "step": 5056 + }, + { + "epoch": 3.5069348127600555, + "grad_norm": 0.4979055135049944, + "learning_rate": 2.479065449309067e-06, + "loss": 0.4396, + "step": 5057 + }, + { + "epoch": 3.507628294036061, + "grad_norm": 0.39550964258047344, + "learning_rate": 2.4769752360376723e-06, + "loss": 0.414, + "step": 5058 + }, + { + "epoch": 3.5083217753120666, + "grad_norm": 0.378927606463655, + "learning_rate": 2.4748856141450132e-06, + "loss": 0.3528, + "step": 5059 + }, + { + "epoch": 3.5090152565880723, + "grad_norm": 0.3872949218780298, + "learning_rate": 2.472796584120877e-06, + "loss": 0.3634, + "step": 5060 + }, + { + "epoch": 3.5097087378640777, + "grad_norm": 0.397521433111934, + "learning_rate": 2.470708146454918e-06, + "loss": 0.3983, + "step": 5061 + }, + { + "epoch": 3.510402219140083, + "grad_norm": 0.38102694479794613, + "learning_rate": 2.4686203016366535e-06, + "loss": 0.3318, + "step": 5062 + }, + { + "epoch": 3.5110957004160888, + "grad_norm": 0.399446659559631, + "learning_rate": 2.4665330501554554e-06, + "loss": 0.3685, + "step": 5063 + }, + { + "epoch": 3.5117891816920945, + "grad_norm": 0.38685844927839425, + "learning_rate": 2.464446392500562e-06, + "loss": 0.3921, + "step": 5064 + }, + { + "epoch": 3.5124826629681, + "grad_norm": 0.3749103434827653, + "learning_rate": 2.462360329161073e-06, + "loss": 0.3747, + "step": 5065 + }, + { + "epoch": 3.513176144244105, + "grad_norm": 0.37880554878461997, + "learning_rate": 2.4602748606259424e-06, + "loss": 0.4149, + "step": 5066 + }, + { + "epoch": 3.513869625520111, + "grad_norm": 0.5888943881940791, + "learning_rate": 2.4581899873839903e-06, + "loss": 0.4102, + "step": 5067 + }, + { + "epoch": 3.5145631067961167, + "grad_norm": 0.4044519651208474, + "learning_rate": 2.4561057099238973e-06, + "loss": 0.3612, + "step": 5068 + }, + { + "epoch": 3.515256588072122, + "grad_norm": 0.36750360054231845, + "learning_rate": 2.4540220287342022e-06, + "loss": 0.3566, + "step": 5069 + }, + { + "epoch": 3.5159500693481274, + "grad_norm": 0.39329357708485335, + "learning_rate": 2.451938944303306e-06, + "loss": 0.4011, + "step": 5070 + }, + { + "epoch": 3.516643550624133, + "grad_norm": 0.3828839657383846, + "learning_rate": 2.449856457119466e-06, + "loss": 0.3475, + "step": 5071 + }, + { + "epoch": 3.517337031900139, + "grad_norm": 0.46731411794670946, + "learning_rate": 2.447774567670803e-06, + "loss": 0.3809, + "step": 5072 + }, + { + "epoch": 3.5180305131761442, + "grad_norm": 0.4043809708549121, + "learning_rate": 2.4456932764452995e-06, + "loss": 0.4005, + "step": 5073 + }, + { + "epoch": 3.5187239944521496, + "grad_norm": 0.37770022911936146, + "learning_rate": 2.4436125839307907e-06, + "loss": 0.3502, + "step": 5074 + }, + { + "epoch": 3.5194174757281553, + "grad_norm": 0.44234399066626795, + "learning_rate": 2.441532490614978e-06, + "loss": 0.3422, + "step": 5075 + }, + { + "epoch": 3.520110957004161, + "grad_norm": 0.3885938237966985, + "learning_rate": 2.43945299698542e-06, + "loss": 0.3714, + "step": 5076 + }, + { + "epoch": 3.5208044382801664, + "grad_norm": 0.40177653501425653, + "learning_rate": 2.4373741035295357e-06, + "loss": 0.3961, + "step": 5077 + }, + { + "epoch": 3.5214979195561718, + "grad_norm": 0.4000854666406545, + "learning_rate": 2.435295810734604e-06, + "loss": 0.4198, + "step": 5078 + }, + { + "epoch": 3.5221914008321775, + "grad_norm": 0.6035082194702799, + "learning_rate": 2.4332181190877573e-06, + "loss": 0.3566, + "step": 5079 + }, + { + "epoch": 3.5228848821081833, + "grad_norm": 0.4422150008665714, + "learning_rate": 2.4311410290759945e-06, + "loss": 0.3611, + "step": 5080 + }, + { + "epoch": 3.5235783633841886, + "grad_norm": 0.41796860801440094, + "learning_rate": 2.4290645411861717e-06, + "loss": 0.3918, + "step": 5081 + }, + { + "epoch": 3.524271844660194, + "grad_norm": 0.38059600356752527, + "learning_rate": 2.4269886559049995e-06, + "loss": 0.3654, + "step": 5082 + }, + { + "epoch": 3.5249653259361997, + "grad_norm": 0.4385285805622044, + "learning_rate": 2.4249133737190526e-06, + "loss": 0.4324, + "step": 5083 + }, + { + "epoch": 3.5256588072122055, + "grad_norm": 0.3824968793825981, + "learning_rate": 2.4228386951147596e-06, + "loss": 0.3807, + "step": 5084 + }, + { + "epoch": 3.526352288488211, + "grad_norm": 0.4058825410200674, + "learning_rate": 2.42076462057841e-06, + "loss": 0.3788, + "step": 5085 + }, + { + "epoch": 3.527045769764216, + "grad_norm": 0.408908026325552, + "learning_rate": 2.418691150596158e-06, + "loss": 0.3748, + "step": 5086 + }, + { + "epoch": 3.527739251040222, + "grad_norm": 0.4041296655093561, + "learning_rate": 2.416618285654003e-06, + "loss": 0.3462, + "step": 5087 + }, + { + "epoch": 3.5284327323162277, + "grad_norm": 0.42914773447142135, + "learning_rate": 2.4145460262378145e-06, + "loss": 0.387, + "step": 5088 + }, + { + "epoch": 3.529126213592233, + "grad_norm": 0.37428116045003906, + "learning_rate": 2.4124743728333106e-06, + "loss": 0.3722, + "step": 5089 + }, + { + "epoch": 3.5298196948682383, + "grad_norm": 0.36828207684842446, + "learning_rate": 2.4104033259260737e-06, + "loss": 0.3552, + "step": 5090 + }, + { + "epoch": 3.530513176144244, + "grad_norm": 0.3821252503893683, + "learning_rate": 2.408332886001545e-06, + "loss": 0.3779, + "step": 5091 + }, + { + "epoch": 3.53120665742025, + "grad_norm": 0.40320422676898987, + "learning_rate": 2.4062630535450156e-06, + "loss": 0.3897, + "step": 5092 + }, + { + "epoch": 3.531900138696255, + "grad_norm": 0.3957273762510209, + "learning_rate": 2.4041938290416416e-06, + "loss": 0.3917, + "step": 5093 + }, + { + "epoch": 3.5325936199722605, + "grad_norm": 0.4101799685605539, + "learning_rate": 2.402125212976435e-06, + "loss": 0.3782, + "step": 5094 + }, + { + "epoch": 3.5332871012482663, + "grad_norm": 0.4153548487891646, + "learning_rate": 2.4000572058342637e-06, + "loss": 0.3845, + "step": 5095 + }, + { + "epoch": 3.533980582524272, + "grad_norm": 0.4020625598523758, + "learning_rate": 2.3979898080998546e-06, + "loss": 0.3843, + "step": 5096 + }, + { + "epoch": 3.5346740638002774, + "grad_norm": 0.4637139216704794, + "learning_rate": 2.3959230202577893e-06, + "loss": 0.3767, + "step": 5097 + }, + { + "epoch": 3.5353675450762827, + "grad_norm": 0.39560653135499474, + "learning_rate": 2.3938568427925073e-06, + "loss": 0.3734, + "step": 5098 + }, + { + "epoch": 3.5360610263522885, + "grad_norm": 0.41952487368460134, + "learning_rate": 2.3917912761883092e-06, + "loss": 0.3838, + "step": 5099 + }, + { + "epoch": 3.5367545076282942, + "grad_norm": 0.4073243407179045, + "learning_rate": 2.3897263209293446e-06, + "loss": 0.3693, + "step": 5100 + }, + { + "epoch": 3.5374479889042996, + "grad_norm": 0.3939945386639667, + "learning_rate": 2.3876619774996263e-06, + "loss": 0.3393, + "step": 5101 + }, + { + "epoch": 3.538141470180305, + "grad_norm": 0.36974413315504706, + "learning_rate": 2.3855982463830222e-06, + "loss": 0.3259, + "step": 5102 + }, + { + "epoch": 3.5388349514563107, + "grad_norm": 0.42464960403658675, + "learning_rate": 2.3835351280632514e-06, + "loss": 0.3199, + "step": 5103 + }, + { + "epoch": 3.5395284327323164, + "grad_norm": 0.4219050841635451, + "learning_rate": 2.3814726230239006e-06, + "loss": 0.4182, + "step": 5104 + }, + { + "epoch": 3.5402219140083218, + "grad_norm": 0.4384143436449698, + "learning_rate": 2.379410731748401e-06, + "loss": 0.3716, + "step": 5105 + }, + { + "epoch": 3.540915395284327, + "grad_norm": 0.4283665946687072, + "learning_rate": 2.3773494547200463e-06, + "loss": 0.3936, + "step": 5106 + }, + { + "epoch": 3.541608876560333, + "grad_norm": 0.3967184806789085, + "learning_rate": 2.375288792421988e-06, + "loss": 0.3334, + "step": 5107 + }, + { + "epoch": 3.5423023578363386, + "grad_norm": 0.3948682884165533, + "learning_rate": 2.3732287453372254e-06, + "loss": 0.3801, + "step": 5108 + }, + { + "epoch": 3.542995839112344, + "grad_norm": 0.40342021294617914, + "learning_rate": 2.371169313948621e-06, + "loss": 0.4084, + "step": 5109 + }, + { + "epoch": 3.5436893203883493, + "grad_norm": 0.39350008152705535, + "learning_rate": 2.3691104987388923e-06, + "loss": 0.3347, + "step": 5110 + }, + { + "epoch": 3.544382801664355, + "grad_norm": 0.41690554675472585, + "learning_rate": 2.367052300190607e-06, + "loss": 0.3863, + "step": 5111 + }, + { + "epoch": 3.545076282940361, + "grad_norm": 0.43141064905132337, + "learning_rate": 2.364994718786194e-06, + "loss": 0.3758, + "step": 5112 + }, + { + "epoch": 3.545769764216366, + "grad_norm": 0.40127986009187966, + "learning_rate": 2.362937755007935e-06, + "loss": 0.4047, + "step": 5113 + }, + { + "epoch": 3.5464632454923715, + "grad_norm": 0.4010495150000716, + "learning_rate": 2.360881409337968e-06, + "loss": 0.3889, + "step": 5114 + }, + { + "epoch": 3.5471567267683772, + "grad_norm": 0.4127465105466682, + "learning_rate": 2.3588256822582874e-06, + "loss": 0.3607, + "step": 5115 + }, + { + "epoch": 3.547850208044383, + "grad_norm": 0.3774381045847109, + "learning_rate": 2.3567705742507364e-06, + "loss": 0.3763, + "step": 5116 + }, + { + "epoch": 3.5485436893203883, + "grad_norm": 0.4120927121466516, + "learning_rate": 2.3547160857970198e-06, + "loss": 0.3948, + "step": 5117 + }, + { + "epoch": 3.5492371705963937, + "grad_norm": 0.4292601858966053, + "learning_rate": 2.352662217378696e-06, + "loss": 0.3482, + "step": 5118 + }, + { + "epoch": 3.5499306518723994, + "grad_norm": 0.5319920019326687, + "learning_rate": 2.3506089694771737e-06, + "loss": 0.3561, + "step": 5119 + }, + { + "epoch": 3.550624133148405, + "grad_norm": 0.39384389412021664, + "learning_rate": 2.3485563425737234e-06, + "loss": 0.3281, + "step": 5120 + }, + { + "epoch": 3.5513176144244105, + "grad_norm": 0.3933910607219121, + "learning_rate": 2.34650433714946e-06, + "loss": 0.3821, + "step": 5121 + }, + { + "epoch": 3.552011095700416, + "grad_norm": 0.4656496497791498, + "learning_rate": 2.3444529536853645e-06, + "loss": 0.4006, + "step": 5122 + }, + { + "epoch": 3.5527045769764216, + "grad_norm": 0.4354125644486449, + "learning_rate": 2.342402192662266e-06, + "loss": 0.3535, + "step": 5123 + }, + { + "epoch": 3.5533980582524274, + "grad_norm": 0.37343046982874845, + "learning_rate": 2.3403520545608442e-06, + "loss": 0.3546, + "step": 5124 + }, + { + "epoch": 3.5540915395284327, + "grad_norm": 0.37100827250947604, + "learning_rate": 2.338302539861639e-06, + "loss": 0.3589, + "step": 5125 + }, + { + "epoch": 3.554785020804438, + "grad_norm": 0.3744864477167227, + "learning_rate": 2.3362536490450434e-06, + "loss": 0.3411, + "step": 5126 + }, + { + "epoch": 3.555478502080444, + "grad_norm": 0.3908935070654357, + "learning_rate": 2.3342053825912987e-06, + "loss": 0.3888, + "step": 5127 + }, + { + "epoch": 3.5561719833564496, + "grad_norm": 0.4093299461242767, + "learning_rate": 2.3321577409805074e-06, + "loss": 0.3393, + "step": 5128 + }, + { + "epoch": 3.556865464632455, + "grad_norm": 0.40844912948944956, + "learning_rate": 2.3301107246926187e-06, + "loss": 0.4082, + "step": 5129 + }, + { + "epoch": 3.5575589459084602, + "grad_norm": 0.3820327365173448, + "learning_rate": 2.3280643342074377e-06, + "loss": 0.401, + "step": 5130 + }, + { + "epoch": 3.558252427184466, + "grad_norm": 0.44093755257143197, + "learning_rate": 2.3260185700046295e-06, + "loss": 0.3968, + "step": 5131 + }, + { + "epoch": 3.5589459084604718, + "grad_norm": 0.3847255774776019, + "learning_rate": 2.3239734325637007e-06, + "loss": 0.3724, + "step": 5132 + }, + { + "epoch": 3.559639389736477, + "grad_norm": 0.36905387048496696, + "learning_rate": 2.3219289223640207e-06, + "loss": 0.3823, + "step": 5133 + }, + { + "epoch": 3.5603328710124824, + "grad_norm": 0.3920318487160868, + "learning_rate": 2.319885039884804e-06, + "loss": 0.3751, + "step": 5134 + }, + { + "epoch": 3.561026352288488, + "grad_norm": 0.4568631125919066, + "learning_rate": 2.3178417856051232e-06, + "loss": 0.3909, + "step": 5135 + }, + { + "epoch": 3.561719833564494, + "grad_norm": 0.3993058283619273, + "learning_rate": 2.3157991600039055e-06, + "loss": 0.4318, + "step": 5136 + }, + { + "epoch": 3.5624133148404993, + "grad_norm": 0.3888013021125074, + "learning_rate": 2.313757163559922e-06, + "loss": 0.3148, + "step": 5137 + }, + { + "epoch": 3.5631067961165046, + "grad_norm": 0.43380746426691436, + "learning_rate": 2.3117157967518052e-06, + "loss": 0.38, + "step": 5138 + }, + { + "epoch": 3.5638002773925104, + "grad_norm": 0.3689918620136303, + "learning_rate": 2.309675060058036e-06, + "loss": 0.3817, + "step": 5139 + }, + { + "epoch": 3.564493758668516, + "grad_norm": 0.4358658667147603, + "learning_rate": 2.307634953956948e-06, + "loss": 0.3935, + "step": 5140 + }, + { + "epoch": 3.5651872399445215, + "grad_norm": 0.39093448447171747, + "learning_rate": 2.3055954789267306e-06, + "loss": 0.3829, + "step": 5141 + }, + { + "epoch": 3.565880721220527, + "grad_norm": 0.5781377056891382, + "learning_rate": 2.3035566354454163e-06, + "loss": 0.3524, + "step": 5142 + }, + { + "epoch": 3.5665742024965326, + "grad_norm": 0.393615679603404, + "learning_rate": 2.301518423990899e-06, + "loss": 0.3897, + "step": 5143 + }, + { + "epoch": 3.5672676837725383, + "grad_norm": 0.4279025338548436, + "learning_rate": 2.299480845040921e-06, + "loss": 0.397, + "step": 5144 + }, + { + "epoch": 3.5679611650485437, + "grad_norm": 0.40230372264558606, + "learning_rate": 2.2974438990730734e-06, + "loss": 0.3984, + "step": 5145 + }, + { + "epoch": 3.568654646324549, + "grad_norm": 0.38299653364237374, + "learning_rate": 2.2954075865648027e-06, + "loss": 0.377, + "step": 5146 + }, + { + "epoch": 3.5693481276005548, + "grad_norm": 0.37389128567166785, + "learning_rate": 2.2933719079934064e-06, + "loss": 0.3493, + "step": 5147 + }, + { + "epoch": 3.5700416088765605, + "grad_norm": 0.38239529686987483, + "learning_rate": 2.291336863836032e-06, + "loss": 0.3627, + "step": 5148 + }, + { + "epoch": 3.570735090152566, + "grad_norm": 0.4229452525792137, + "learning_rate": 2.2893024545696822e-06, + "loss": 0.3427, + "step": 5149 + }, + { + "epoch": 3.571428571428571, + "grad_norm": 0.37419667452307964, + "learning_rate": 2.2872686806712037e-06, + "loss": 0.3416, + "step": 5150 + }, + { + "epoch": 3.572122052704577, + "grad_norm": 0.38368312027831164, + "learning_rate": 2.285235542617299e-06, + "loss": 0.4175, + "step": 5151 + }, + { + "epoch": 3.5728155339805827, + "grad_norm": 0.4458706093811005, + "learning_rate": 2.283203040884524e-06, + "loss": 0.4039, + "step": 5152 + }, + { + "epoch": 3.573509015256588, + "grad_norm": 0.4468861626451133, + "learning_rate": 2.2811711759492783e-06, + "loss": 0.3766, + "step": 5153 + }, + { + "epoch": 3.5742024965325934, + "grad_norm": 0.4091472015005601, + "learning_rate": 2.2791399482878184e-06, + "loss": 0.3722, + "step": 5154 + }, + { + "epoch": 3.574895977808599, + "grad_norm": 0.37328416454054747, + "learning_rate": 2.2771093583762517e-06, + "loss": 0.338, + "step": 5155 + }, + { + "epoch": 3.575589459084605, + "grad_norm": 0.37681455419911747, + "learning_rate": 2.2750794066905268e-06, + "loss": 0.3718, + "step": 5156 + }, + { + "epoch": 3.5762829403606102, + "grad_norm": 0.4031006990308686, + "learning_rate": 2.273050093706458e-06, + "loss": 0.3773, + "step": 5157 + }, + { + "epoch": 3.5769764216366156, + "grad_norm": 0.5014256774511401, + "learning_rate": 2.271021419899696e-06, + "loss": 0.3968, + "step": 5158 + }, + { + "epoch": 3.5776699029126213, + "grad_norm": 0.374929393250059, + "learning_rate": 2.2689933857457492e-06, + "loss": 0.3706, + "step": 5159 + }, + { + "epoch": 3.578363384188627, + "grad_norm": 0.39485848072786517, + "learning_rate": 2.2669659917199755e-06, + "loss": 0.3597, + "step": 5160 + }, + { + "epoch": 3.5790568654646324, + "grad_norm": 0.5046269872671587, + "learning_rate": 2.264939238297578e-06, + "loss": 0.3977, + "step": 5161 + }, + { + "epoch": 3.5797503467406377, + "grad_norm": 0.3932101896956029, + "learning_rate": 2.2629131259536147e-06, + "loss": 0.3884, + "step": 5162 + }, + { + "epoch": 3.5804438280166435, + "grad_norm": 0.8090460602154435, + "learning_rate": 2.2608876551629933e-06, + "loss": 0.3655, + "step": 5163 + }, + { + "epoch": 3.5811373092926493, + "grad_norm": 1.212908152197435, + "learning_rate": 2.2588628264004663e-06, + "loss": 0.3473, + "step": 5164 + }, + { + "epoch": 3.5818307905686546, + "grad_norm": 0.3903522565004271, + "learning_rate": 2.256838640140641e-06, + "loss": 0.4017, + "step": 5165 + }, + { + "epoch": 3.58252427184466, + "grad_norm": 0.3739733604519432, + "learning_rate": 2.2548150968579712e-06, + "loss": 0.373, + "step": 5166 + }, + { + "epoch": 3.5832177531206657, + "grad_norm": 0.3800201802447021, + "learning_rate": 2.2527921970267614e-06, + "loss": 0.3788, + "step": 5167 + }, + { + "epoch": 3.5839112343966715, + "grad_norm": 0.4120563299207978, + "learning_rate": 2.2507699411211658e-06, + "loss": 0.3403, + "step": 5168 + }, + { + "epoch": 3.584604715672677, + "grad_norm": 0.4957857029161015, + "learning_rate": 2.2487483296151836e-06, + "loss": 0.3921, + "step": 5169 + }, + { + "epoch": 3.585298196948682, + "grad_norm": 0.3930781419612483, + "learning_rate": 2.2467273629826674e-06, + "loss": 0.3803, + "step": 5170 + }, + { + "epoch": 3.585991678224688, + "grad_norm": 0.4114561838764046, + "learning_rate": 2.244707041697319e-06, + "loss": 0.4147, + "step": 5171 + }, + { + "epoch": 3.5866851595006937, + "grad_norm": 0.3704017677315467, + "learning_rate": 2.242687366232683e-06, + "loss": 0.313, + "step": 5172 + }, + { + "epoch": 3.587378640776699, + "grad_norm": 0.38027636666060016, + "learning_rate": 2.240668337062162e-06, + "loss": 0.4064, + "step": 5173 + }, + { + "epoch": 3.5880721220527043, + "grad_norm": 0.3975355553666112, + "learning_rate": 2.238649954658994e-06, + "loss": 0.3724, + "step": 5174 + }, + { + "epoch": 3.58876560332871, + "grad_norm": 0.4489736751899162, + "learning_rate": 2.2366322194962804e-06, + "loss": 0.3733, + "step": 5175 + }, + { + "epoch": 3.589459084604716, + "grad_norm": 0.4006245394676718, + "learning_rate": 2.234615132046962e-06, + "loss": 0.3374, + "step": 5176 + }, + { + "epoch": 3.590152565880721, + "grad_norm": 0.4117978416065849, + "learning_rate": 2.2325986927838286e-06, + "loss": 0.3791, + "step": 5177 + }, + { + "epoch": 3.5908460471567265, + "grad_norm": 0.43688854097432955, + "learning_rate": 2.23058290217952e-06, + "loss": 0.3162, + "step": 5178 + }, + { + "epoch": 3.5915395284327323, + "grad_norm": 0.38144373688509364, + "learning_rate": 2.2285677607065204e-06, + "loss": 0.3404, + "step": 5179 + }, + { + "epoch": 3.592233009708738, + "grad_norm": 0.38334174411156685, + "learning_rate": 2.226553268837166e-06, + "loss": 0.3931, + "step": 5180 + }, + { + "epoch": 3.5929264909847434, + "grad_norm": 0.865409573635709, + "learning_rate": 2.224539427043641e-06, + "loss": 0.3609, + "step": 5181 + }, + { + "epoch": 3.5936199722607487, + "grad_norm": 0.5822693629401792, + "learning_rate": 2.2225262357979714e-06, + "loss": 0.3555, + "step": 5182 + }, + { + "epoch": 3.5943134535367545, + "grad_norm": 0.43446102175494017, + "learning_rate": 2.2205136955720373e-06, + "loss": 0.4107, + "step": 5183 + }, + { + "epoch": 3.5950069348127602, + "grad_norm": 0.37732508892255723, + "learning_rate": 2.218501806837563e-06, + "loss": 0.3828, + "step": 5184 + }, + { + "epoch": 3.5957004160887656, + "grad_norm": 0.40385338731973575, + "learning_rate": 2.21649057006612e-06, + "loss": 0.3169, + "step": 5185 + }, + { + "epoch": 3.596393897364771, + "grad_norm": 0.4004502528288485, + "learning_rate": 2.2144799857291305e-06, + "loss": 0.3899, + "step": 5186 + }, + { + "epoch": 3.5970873786407767, + "grad_norm": 0.41078660164304187, + "learning_rate": 2.2124700542978566e-06, + "loss": 0.3769, + "step": 5187 + }, + { + "epoch": 3.5977808599167824, + "grad_norm": 0.4357819082768897, + "learning_rate": 2.210460776243414e-06, + "loss": 0.413, + "step": 5188 + }, + { + "epoch": 3.5984743411927878, + "grad_norm": 0.44925926321763543, + "learning_rate": 2.208452152036764e-06, + "loss": 0.3736, + "step": 5189 + }, + { + "epoch": 3.599167822468793, + "grad_norm": 0.3885083649446202, + "learning_rate": 2.2064441821487107e-06, + "loss": 0.385, + "step": 5190 + }, + { + "epoch": 3.599861303744799, + "grad_norm": 0.39902687081781524, + "learning_rate": 2.2044368670499093e-06, + "loss": 0.375, + "step": 5191 + }, + { + "epoch": 3.6005547850208046, + "grad_norm": 0.4080053013914103, + "learning_rate": 2.20243020721086e-06, + "loss": 0.3943, + "step": 5192 + }, + { + "epoch": 3.60124826629681, + "grad_norm": 0.403542067467956, + "learning_rate": 2.200424203101909e-06, + "loss": 0.4101, + "step": 5193 + }, + { + "epoch": 3.6019417475728153, + "grad_norm": 0.4155584753912239, + "learning_rate": 2.1984188551932513e-06, + "loss": 0.4447, + "step": 5194 + }, + { + "epoch": 3.602635228848821, + "grad_norm": 0.37401766239256234, + "learning_rate": 2.1964141639549217e-06, + "loss": 0.3474, + "step": 5195 + }, + { + "epoch": 3.603328710124827, + "grad_norm": 0.37833107582867914, + "learning_rate": 2.1944101298568076e-06, + "loss": 0.3278, + "step": 5196 + }, + { + "epoch": 3.604022191400832, + "grad_norm": 0.41213190344062667, + "learning_rate": 2.192406753368642e-06, + "loss": 0.3648, + "step": 5197 + }, + { + "epoch": 3.6047156726768375, + "grad_norm": 0.603651947577292, + "learning_rate": 2.1904040349599986e-06, + "loss": 0.3793, + "step": 5198 + }, + { + "epoch": 3.6054091539528432, + "grad_norm": 0.3918556428222181, + "learning_rate": 2.1884019751003003e-06, + "loss": 0.3799, + "step": 5199 + }, + { + "epoch": 3.606102635228849, + "grad_norm": 0.3712233854423895, + "learning_rate": 2.186400574258819e-06, + "loss": 0.3555, + "step": 5200 + }, + { + "epoch": 3.6067961165048543, + "grad_norm": 0.4253416479267744, + "learning_rate": 2.184399832904662e-06, + "loss": 0.4025, + "step": 5201 + }, + { + "epoch": 3.6074895977808596, + "grad_norm": 0.37654802991971503, + "learning_rate": 2.182399751506797e-06, + "loss": 0.3927, + "step": 5202 + }, + { + "epoch": 3.6081830790568654, + "grad_norm": 0.36641420344233405, + "learning_rate": 2.1804003305340217e-06, + "loss": 0.3237, + "step": 5203 + }, + { + "epoch": 3.608876560332871, + "grad_norm": 0.4157666780151188, + "learning_rate": 2.178401570454989e-06, + "loss": 0.3956, + "step": 5204 + }, + { + "epoch": 3.6095700416088765, + "grad_norm": 0.423908334019587, + "learning_rate": 2.1764034717381943e-06, + "loss": 0.393, + "step": 5205 + }, + { + "epoch": 3.610263522884882, + "grad_norm": 0.38338421385044325, + "learning_rate": 2.1744060348519753e-06, + "loss": 0.3736, + "step": 5206 + }, + { + "epoch": 3.6109570041608876, + "grad_norm": 0.471407961354846, + "learning_rate": 2.1724092602645177e-06, + "loss": 0.3792, + "step": 5207 + }, + { + "epoch": 3.6116504854368934, + "grad_norm": 0.4868353994286507, + "learning_rate": 2.1704131484438523e-06, + "loss": 0.4532, + "step": 5208 + }, + { + "epoch": 3.6123439667128987, + "grad_norm": 0.3861177034846141, + "learning_rate": 2.1684176998578506e-06, + "loss": 0.3391, + "step": 5209 + }, + { + "epoch": 3.613037447988904, + "grad_norm": 0.3927332943918114, + "learning_rate": 2.1664229149742328e-06, + "loss": 0.3354, + "step": 5210 + }, + { + "epoch": 3.61373092926491, + "grad_norm": 0.4096638230494183, + "learning_rate": 2.1644287942605618e-06, + "loss": 0.3739, + "step": 5211 + }, + { + "epoch": 3.6144244105409156, + "grad_norm": 0.4116400461050245, + "learning_rate": 2.1624353381842457e-06, + "loss": 0.3364, + "step": 5212 + }, + { + "epoch": 3.615117891816921, + "grad_norm": 0.7713549114821813, + "learning_rate": 2.1604425472125375e-06, + "loss": 0.3669, + "step": 5213 + }, + { + "epoch": 3.615811373092926, + "grad_norm": 0.38062243046752114, + "learning_rate": 2.1584504218125293e-06, + "loss": 0.3466, + "step": 5214 + }, + { + "epoch": 3.616504854368932, + "grad_norm": 0.39060587592483537, + "learning_rate": 2.156458962451164e-06, + "loss": 0.3289, + "step": 5215 + }, + { + "epoch": 3.6171983356449378, + "grad_norm": 0.42825069685408346, + "learning_rate": 2.154468169595223e-06, + "loss": 0.4044, + "step": 5216 + }, + { + "epoch": 3.617891816920943, + "grad_norm": 0.4182024432537755, + "learning_rate": 2.1524780437113343e-06, + "loss": 0.3812, + "step": 5217 + }, + { + "epoch": 3.6185852981969484, + "grad_norm": 0.3913450957068026, + "learning_rate": 2.1504885852659713e-06, + "loss": 0.418, + "step": 5218 + }, + { + "epoch": 3.619278779472954, + "grad_norm": 0.39355566557808314, + "learning_rate": 2.1484997947254432e-06, + "loss": 0.3725, + "step": 5219 + }, + { + "epoch": 3.61997226074896, + "grad_norm": 0.3865207872346973, + "learning_rate": 2.1465116725559133e-06, + "loss": 0.383, + "step": 5220 + }, + { + "epoch": 3.6206657420249653, + "grad_norm": 0.38882907571891734, + "learning_rate": 2.144524219223383e-06, + "loss": 0.3864, + "step": 5221 + }, + { + "epoch": 3.6213592233009706, + "grad_norm": 0.4207062110836621, + "learning_rate": 2.1425374351936946e-06, + "loss": 0.415, + "step": 5222 + }, + { + "epoch": 3.6220527045769764, + "grad_norm": 0.38788059027756255, + "learning_rate": 2.140551320932538e-06, + "loss": 0.3918, + "step": 5223 + }, + { + "epoch": 3.622746185852982, + "grad_norm": 0.39990187598462223, + "learning_rate": 2.1385658769054406e-06, + "loss": 0.4229, + "step": 5224 + }, + { + "epoch": 3.6234396671289875, + "grad_norm": 0.3915087703589566, + "learning_rate": 2.1365811035777783e-06, + "loss": 0.3641, + "step": 5225 + }, + { + "epoch": 3.624133148404993, + "grad_norm": 0.38639127097736065, + "learning_rate": 2.1345970014147695e-06, + "loss": 0.3469, + "step": 5226 + }, + { + "epoch": 3.6248266296809986, + "grad_norm": 0.38141827418371743, + "learning_rate": 2.1326135708814695e-06, + "loss": 0.3696, + "step": 5227 + }, + { + "epoch": 3.6255201109570043, + "grad_norm": 0.37335636095513836, + "learning_rate": 2.1306308124427818e-06, + "loss": 0.3885, + "step": 5228 + }, + { + "epoch": 3.6262135922330097, + "grad_norm": 0.37237848011590075, + "learning_rate": 2.1286487265634503e-06, + "loss": 0.365, + "step": 5229 + }, + { + "epoch": 3.6269070735090154, + "grad_norm": 0.40953904046546347, + "learning_rate": 2.126667313708062e-06, + "loss": 0.3883, + "step": 5230 + }, + { + "epoch": 3.6276005547850207, + "grad_norm": 0.36473035125010644, + "learning_rate": 2.1246865743410485e-06, + "loss": 0.3911, + "step": 5231 + }, + { + "epoch": 3.6282940360610265, + "grad_norm": 0.40849967577779733, + "learning_rate": 2.122706508926675e-06, + "loss": 0.3997, + "step": 5232 + }, + { + "epoch": 3.628987517337032, + "grad_norm": 0.3934298257638829, + "learning_rate": 2.1207271179290582e-06, + "loss": 0.4122, + "step": 5233 + }, + { + "epoch": 3.6296809986130376, + "grad_norm": 0.4458587643751225, + "learning_rate": 2.118748401812154e-06, + "loss": 0.3703, + "step": 5234 + }, + { + "epoch": 3.630374479889043, + "grad_norm": 0.35756973639921996, + "learning_rate": 2.1167703610397557e-06, + "loss": 0.3577, + "step": 5235 + }, + { + "epoch": 3.6310679611650487, + "grad_norm": 0.3823509901853653, + "learning_rate": 2.1147929960755033e-06, + "loss": 0.3832, + "step": 5236 + }, + { + "epoch": 3.631761442441054, + "grad_norm": 0.4289814467794912, + "learning_rate": 2.1128163073828766e-06, + "loss": 0.3614, + "step": 5237 + }, + { + "epoch": 3.63245492371706, + "grad_norm": 0.4005712963390427, + "learning_rate": 2.1108402954251978e-06, + "loss": 0.325, + "step": 5238 + }, + { + "epoch": 3.633148404993065, + "grad_norm": 0.4025266583207819, + "learning_rate": 2.1088649606656313e-06, + "loss": 0.4357, + "step": 5239 + }, + { + "epoch": 3.633841886269071, + "grad_norm": 0.4057167364696131, + "learning_rate": 2.1068903035671777e-06, + "loss": 0.3378, + "step": 5240 + }, + { + "epoch": 3.6345353675450762, + "grad_norm": 0.40736315320025795, + "learning_rate": 2.104916324592684e-06, + "loss": 0.3879, + "step": 5241 + }, + { + "epoch": 3.635228848821082, + "grad_norm": 0.4039867441121231, + "learning_rate": 2.102943024204838e-06, + "loss": 0.3768, + "step": 5242 + }, + { + "epoch": 3.6359223300970873, + "grad_norm": 0.39300261998214253, + "learning_rate": 2.1009704028661643e-06, + "loss": 0.38, + "step": 5243 + }, + { + "epoch": 3.636615811373093, + "grad_norm": 0.40640668117504203, + "learning_rate": 2.0989984610390325e-06, + "loss": 0.3656, + "step": 5244 + }, + { + "epoch": 3.6373092926490984, + "grad_norm": 0.4222430820699589, + "learning_rate": 2.097027199185653e-06, + "loss": 0.3962, + "step": 5245 + }, + { + "epoch": 3.638002773925104, + "grad_norm": 0.42119368631648546, + "learning_rate": 2.0950566177680706e-06, + "loss": 0.3717, + "step": 5246 + }, + { + "epoch": 3.6386962552011095, + "grad_norm": 0.4265118119973986, + "learning_rate": 2.093086717248181e-06, + "loss": 0.3809, + "step": 5247 + }, + { + "epoch": 3.6393897364771153, + "grad_norm": 0.40525527616582613, + "learning_rate": 2.0911174980877106e-06, + "loss": 0.3448, + "step": 5248 + }, + { + "epoch": 3.6400832177531206, + "grad_norm": 0.4194810086531931, + "learning_rate": 2.0891489607482322e-06, + "loss": 0.3511, + "step": 5249 + }, + { + "epoch": 3.6407766990291264, + "grad_norm": 0.37969005824143925, + "learning_rate": 2.0871811056911574e-06, + "loss": 0.4411, + "step": 5250 + }, + { + "epoch": 3.6414701803051317, + "grad_norm": 0.40178574873091283, + "learning_rate": 2.085213933377734e-06, + "loss": 0.3827, + "step": 5251 + }, + { + "epoch": 3.6421636615811375, + "grad_norm": 0.4367908327295253, + "learning_rate": 2.083247444269055e-06, + "loss": 0.3677, + "step": 5252 + }, + { + "epoch": 3.642857142857143, + "grad_norm": 0.39093543065160924, + "learning_rate": 2.081281638826052e-06, + "loss": 0.3937, + "step": 5253 + }, + { + "epoch": 3.6435506241331486, + "grad_norm": 0.4135113531118907, + "learning_rate": 2.079316517509493e-06, + "loss": 0.4199, + "step": 5254 + }, + { + "epoch": 3.644244105409154, + "grad_norm": 0.41908321135583315, + "learning_rate": 2.0773520807799903e-06, + "loss": 0.3686, + "step": 5255 + }, + { + "epoch": 3.6449375866851597, + "grad_norm": 0.38419745499152985, + "learning_rate": 2.075388329097992e-06, + "loss": 0.3587, + "step": 5256 + }, + { + "epoch": 3.645631067961165, + "grad_norm": 0.3874963260170016, + "learning_rate": 2.0734252629237892e-06, + "loss": 0.3582, + "step": 5257 + }, + { + "epoch": 3.6463245492371708, + "grad_norm": 0.35940525524850075, + "learning_rate": 2.071462882717511e-06, + "loss": 0.3759, + "step": 5258 + }, + { + "epoch": 3.647018030513176, + "grad_norm": 0.3764549928368571, + "learning_rate": 2.069501188939122e-06, + "loss": 0.4275, + "step": 5259 + }, + { + "epoch": 3.647711511789182, + "grad_norm": 0.3711125453577655, + "learning_rate": 2.0675401820484325e-06, + "loss": 0.3717, + "step": 5260 + }, + { + "epoch": 3.648404993065187, + "grad_norm": 0.37295534035883965, + "learning_rate": 2.0655798625050842e-06, + "loss": 0.3534, + "step": 5261 + }, + { + "epoch": 3.649098474341193, + "grad_norm": 0.4324254663511878, + "learning_rate": 2.063620230768564e-06, + "loss": 0.3809, + "step": 5262 + }, + { + "epoch": 3.6497919556171983, + "grad_norm": 0.4219545422880887, + "learning_rate": 2.061661287298198e-06, + "loss": 0.3908, + "step": 5263 + }, + { + "epoch": 3.650485436893204, + "grad_norm": 0.40626037071603777, + "learning_rate": 2.059703032553142e-06, + "loss": 0.3552, + "step": 5264 + }, + { + "epoch": 3.6511789181692094, + "grad_norm": 0.3970658629986061, + "learning_rate": 2.057745466992404e-06, + "loss": 0.3668, + "step": 5265 + }, + { + "epoch": 3.651872399445215, + "grad_norm": 0.3590732341243277, + "learning_rate": 2.0557885910748177e-06, + "loss": 0.3613, + "step": 5266 + }, + { + "epoch": 3.6525658807212205, + "grad_norm": 0.4029705835190139, + "learning_rate": 2.053832405259063e-06, + "loss": 0.3718, + "step": 5267 + }, + { + "epoch": 3.6532593619972262, + "grad_norm": 0.4336394056599201, + "learning_rate": 2.0518769100036567e-06, + "loss": 0.38, + "step": 5268 + }, + { + "epoch": 3.6539528432732316, + "grad_norm": 0.4114631371488936, + "learning_rate": 2.04992210576695e-06, + "loss": 0.3768, + "step": 5269 + }, + { + "epoch": 3.6546463245492373, + "grad_norm": 0.3968008258109872, + "learning_rate": 2.0479679930071362e-06, + "loss": 0.3704, + "step": 5270 + }, + { + "epoch": 3.6553398058252426, + "grad_norm": 0.4342862056399891, + "learning_rate": 2.0460145721822467e-06, + "loss": 0.4137, + "step": 5271 + }, + { + "epoch": 3.6560332871012484, + "grad_norm": 0.38812134703640805, + "learning_rate": 2.0440618437501466e-06, + "loss": 0.3615, + "step": 5272 + }, + { + "epoch": 3.6567267683772537, + "grad_norm": 0.38389853571050464, + "learning_rate": 2.042109808168542e-06, + "loss": 0.3204, + "step": 5273 + }, + { + "epoch": 3.6574202496532595, + "grad_norm": 0.4165238531784227, + "learning_rate": 2.040158465894976e-06, + "loss": 0.414, + "step": 5274 + }, + { + "epoch": 3.658113730929265, + "grad_norm": 0.4009628554735647, + "learning_rate": 2.03820781738683e-06, + "loss": 0.4092, + "step": 5275 + }, + { + "epoch": 3.6588072122052706, + "grad_norm": 0.40852166635369597, + "learning_rate": 2.0362578631013225e-06, + "loss": 0.3599, + "step": 5276 + }, + { + "epoch": 3.659500693481276, + "grad_norm": 0.746066717427677, + "learning_rate": 2.0343086034955064e-06, + "loss": 0.3649, + "step": 5277 + }, + { + "epoch": 3.6601941747572817, + "grad_norm": 0.5389555368446969, + "learning_rate": 2.0323600390262743e-06, + "loss": 0.3518, + "step": 5278 + }, + { + "epoch": 3.660887656033287, + "grad_norm": 0.39812197559818674, + "learning_rate": 2.030412170150359e-06, + "loss": 0.3504, + "step": 5279 + }, + { + "epoch": 3.661581137309293, + "grad_norm": 0.6868818943003177, + "learning_rate": 2.0284649973243214e-06, + "loss": 0.3647, + "step": 5280 + }, + { + "epoch": 3.662274618585298, + "grad_norm": 0.4448039060870248, + "learning_rate": 2.0265185210045686e-06, + "loss": 0.3553, + "step": 5281 + }, + { + "epoch": 3.662968099861304, + "grad_norm": 0.46477284084904175, + "learning_rate": 2.0245727416473388e-06, + "loss": 0.3757, + "step": 5282 + }, + { + "epoch": 3.663661581137309, + "grad_norm": 0.3952221652676267, + "learning_rate": 2.0226276597087095e-06, + "loss": 0.3398, + "step": 5283 + }, + { + "epoch": 3.664355062413315, + "grad_norm": 0.44734737821828086, + "learning_rate": 2.0206832756445954e-06, + "loss": 0.4001, + "step": 5284 + }, + { + "epoch": 3.6650485436893203, + "grad_norm": 0.3992605563509741, + "learning_rate": 2.0187395899107427e-06, + "loss": 0.4067, + "step": 5285 + }, + { + "epoch": 3.665742024965326, + "grad_norm": 0.40523073495924555, + "learning_rate": 2.016796602962739e-06, + "loss": 0.3697, + "step": 5286 + }, + { + "epoch": 3.6664355062413314, + "grad_norm": 0.42897580319741746, + "learning_rate": 2.014854315256007e-06, + "loss": 0.4396, + "step": 5287 + }, + { + "epoch": 3.667128987517337, + "grad_norm": 0.37582277128821234, + "learning_rate": 2.0129127272458034e-06, + "loss": 0.3518, + "step": 5288 + }, + { + "epoch": 3.6678224687933425, + "grad_norm": 0.3830032308147283, + "learning_rate": 2.0109718393872223e-06, + "loss": 0.3998, + "step": 5289 + }, + { + "epoch": 3.6685159500693483, + "grad_norm": 0.3954512366682606, + "learning_rate": 2.0090316521351973e-06, + "loss": 0.3968, + "step": 5290 + }, + { + "epoch": 3.6692094313453536, + "grad_norm": 0.3980477177049583, + "learning_rate": 2.007092165944487e-06, + "loss": 0.391, + "step": 5291 + }, + { + "epoch": 3.6699029126213594, + "grad_norm": 0.41782975999023964, + "learning_rate": 2.005153381269701e-06, + "loss": 0.3869, + "step": 5292 + }, + { + "epoch": 3.6705963938973647, + "grad_norm": 0.42948888554482567, + "learning_rate": 2.0032152985652708e-06, + "loss": 0.4096, + "step": 5293 + }, + { + "epoch": 3.6712898751733705, + "grad_norm": 0.43441263350808534, + "learning_rate": 2.001277918285471e-06, + "loss": 0.442, + "step": 5294 + }, + { + "epoch": 3.671983356449376, + "grad_norm": 0.3857274273216944, + "learning_rate": 1.9993412408844114e-06, + "loss": 0.3885, + "step": 5295 + }, + { + "epoch": 3.6726768377253816, + "grad_norm": 0.38980671828455116, + "learning_rate": 1.997405266816031e-06, + "loss": 0.3614, + "step": 5296 + }, + { + "epoch": 3.673370319001387, + "grad_norm": 0.41422107066383607, + "learning_rate": 1.995469996534111e-06, + "loss": 0.3901, + "step": 5297 + }, + { + "epoch": 3.6740638002773927, + "grad_norm": 0.3923980975330044, + "learning_rate": 1.993535430492265e-06, + "loss": 0.3463, + "step": 5298 + }, + { + "epoch": 3.674757281553398, + "grad_norm": 0.3807959062075781, + "learning_rate": 1.991601569143938e-06, + "loss": 0.3461, + "step": 5299 + }, + { + "epoch": 3.6754507628294038, + "grad_norm": 0.3974929649988162, + "learning_rate": 1.9896684129424164e-06, + "loss": 0.4339, + "step": 5300 + }, + { + "epoch": 3.676144244105409, + "grad_norm": 0.39478911535342665, + "learning_rate": 1.9877359623408167e-06, + "loss": 0.41, + "step": 5301 + }, + { + "epoch": 3.676837725381415, + "grad_norm": 0.418901214268293, + "learning_rate": 1.9858042177920915e-06, + "loss": 0.3946, + "step": 5302 + }, + { + "epoch": 3.67753120665742, + "grad_norm": 0.38307831985414426, + "learning_rate": 1.9838731797490295e-06, + "loss": 0.3753, + "step": 5303 + }, + { + "epoch": 3.678224687933426, + "grad_norm": 0.408751199189437, + "learning_rate": 1.9819428486642488e-06, + "loss": 0.4058, + "step": 5304 + }, + { + "epoch": 3.6789181692094313, + "grad_norm": 0.41520334290650174, + "learning_rate": 1.9800132249902084e-06, + "loss": 0.4078, + "step": 5305 + }, + { + "epoch": 3.679611650485437, + "grad_norm": 0.3911739104132286, + "learning_rate": 1.978084309179194e-06, + "loss": 0.3555, + "step": 5306 + }, + { + "epoch": 3.6803051317614424, + "grad_norm": 0.43896781884169084, + "learning_rate": 1.976156101683332e-06, + "loss": 0.4057, + "step": 5307 + }, + { + "epoch": 3.680998613037448, + "grad_norm": 0.4334563690105161, + "learning_rate": 1.9742286029545823e-06, + "loss": 0.3928, + "step": 5308 + }, + { + "epoch": 3.6816920943134535, + "grad_norm": 0.40411046688184127, + "learning_rate": 1.9723018134447303e-06, + "loss": 0.3775, + "step": 5309 + }, + { + "epoch": 3.6823855755894592, + "grad_norm": 0.4290264322409643, + "learning_rate": 1.970375733605409e-06, + "loss": 0.3756, + "step": 5310 + }, + { + "epoch": 3.6830790568654646, + "grad_norm": 0.34689918706352113, + "learning_rate": 1.968450363888073e-06, + "loss": 0.323, + "step": 5311 + }, + { + "epoch": 3.6837725381414703, + "grad_norm": 0.3852458491943254, + "learning_rate": 1.966525704744016e-06, + "loss": 0.3956, + "step": 5312 + }, + { + "epoch": 3.6844660194174756, + "grad_norm": 0.5183324471904935, + "learning_rate": 1.964601756624366e-06, + "loss": 0.375, + "step": 5313 + }, + { + "epoch": 3.6851595006934814, + "grad_norm": 0.42972473805123057, + "learning_rate": 1.962678519980079e-06, + "loss": 0.4329, + "step": 5314 + }, + { + "epoch": 3.6858529819694867, + "grad_norm": 0.38771107037895286, + "learning_rate": 1.9607559952619497e-06, + "loss": 0.3433, + "step": 5315 + }, + { + "epoch": 3.6865464632454925, + "grad_norm": 0.4479177429977054, + "learning_rate": 1.9588341829206057e-06, + "loss": 0.3611, + "step": 5316 + }, + { + "epoch": 3.687239944521498, + "grad_norm": 0.4042270457950642, + "learning_rate": 1.9569130834065025e-06, + "loss": 0.3614, + "step": 5317 + }, + { + "epoch": 3.6879334257975036, + "grad_norm": 0.44695599347825055, + "learning_rate": 1.9549926971699334e-06, + "loss": 0.4132, + "step": 5318 + }, + { + "epoch": 3.688626907073509, + "grad_norm": 0.3745917664545771, + "learning_rate": 1.953073024661023e-06, + "loss": 0.3743, + "step": 5319 + }, + { + "epoch": 3.6893203883495147, + "grad_norm": 0.40351364311898186, + "learning_rate": 1.9511540663297284e-06, + "loss": 0.3476, + "step": 5320 + }, + { + "epoch": 3.69001386962552, + "grad_norm": 0.4441667258291502, + "learning_rate": 1.949235822625842e-06, + "loss": 0.3439, + "step": 5321 + }, + { + "epoch": 3.690707350901526, + "grad_norm": 0.3978070659280194, + "learning_rate": 1.9473182939989828e-06, + "loss": 0.46, + "step": 5322 + }, + { + "epoch": 3.691400832177531, + "grad_norm": 0.6674323596632495, + "learning_rate": 1.945401480898606e-06, + "loss": 0.3639, + "step": 5323 + }, + { + "epoch": 3.692094313453537, + "grad_norm": 0.42511417504718546, + "learning_rate": 1.943485383774002e-06, + "loss": 0.3712, + "step": 5324 + }, + { + "epoch": 3.692787794729542, + "grad_norm": 0.45108591244684154, + "learning_rate": 1.9415700030742855e-06, + "loss": 0.3685, + "step": 5325 + }, + { + "epoch": 3.693481276005548, + "grad_norm": 0.37479248618189814, + "learning_rate": 1.9396553392484108e-06, + "loss": 0.3845, + "step": 5326 + }, + { + "epoch": 3.6941747572815533, + "grad_norm": 0.41130250834242243, + "learning_rate": 1.9377413927451598e-06, + "loss": 0.3206, + "step": 5327 + }, + { + "epoch": 3.694868238557559, + "grad_norm": 0.3879707868506853, + "learning_rate": 1.9358281640131488e-06, + "loss": 0.3983, + "step": 5328 + }, + { + "epoch": 3.6955617198335644, + "grad_norm": 0.40385556484309626, + "learning_rate": 1.933915653500826e-06, + "loss": 0.3823, + "step": 5329 + }, + { + "epoch": 3.69625520110957, + "grad_norm": 0.39556605551863505, + "learning_rate": 1.932003861656467e-06, + "loss": 0.3723, + "step": 5330 + }, + { + "epoch": 3.6969486823855755, + "grad_norm": 0.3764209278128449, + "learning_rate": 1.930092788928183e-06, + "loss": 0.3707, + "step": 5331 + }, + { + "epoch": 3.6976421636615813, + "grad_norm": 0.38952718475498654, + "learning_rate": 1.9281824357639178e-06, + "loss": 0.361, + "step": 5332 + }, + { + "epoch": 3.6983356449375866, + "grad_norm": 0.40343682984359014, + "learning_rate": 1.926272802611441e-06, + "loss": 0.3408, + "step": 5333 + }, + { + "epoch": 3.6990291262135924, + "grad_norm": 1.1020813762226875, + "learning_rate": 1.9243638899183577e-06, + "loss": 0.3693, + "step": 5334 + }, + { + "epoch": 3.6997226074895977, + "grad_norm": 0.4520957815713009, + "learning_rate": 1.922455698132104e-06, + "loss": 0.3831, + "step": 5335 + }, + { + "epoch": 3.7004160887656035, + "grad_norm": 0.3865871728402252, + "learning_rate": 1.920548227699946e-06, + "loss": 0.3866, + "step": 5336 + }, + { + "epoch": 3.701109570041609, + "grad_norm": 0.3999319677398349, + "learning_rate": 1.918641479068983e-06, + "loss": 0.3176, + "step": 5337 + }, + { + "epoch": 3.7018030513176146, + "grad_norm": 0.3954960005167473, + "learning_rate": 1.916735452686139e-06, + "loss": 0.4194, + "step": 5338 + }, + { + "epoch": 3.70249653259362, + "grad_norm": 0.39470818701966953, + "learning_rate": 1.9148301489981753e-06, + "loss": 0.3721, + "step": 5339 + }, + { + "epoch": 3.7031900138696257, + "grad_norm": 0.410901626788164, + "learning_rate": 1.9129255684516824e-06, + "loss": 0.3701, + "step": 5340 + }, + { + "epoch": 3.703883495145631, + "grad_norm": 0.4030037016470731, + "learning_rate": 1.9110217114930766e-06, + "loss": 0.3949, + "step": 5341 + }, + { + "epoch": 3.7045769764216367, + "grad_norm": 0.37219093581630314, + "learning_rate": 1.9091185785686106e-06, + "loss": 0.3626, + "step": 5342 + }, + { + "epoch": 3.705270457697642, + "grad_norm": 0.3826855962147528, + "learning_rate": 1.9072161701243664e-06, + "loss": 0.3355, + "step": 5343 + }, + { + "epoch": 3.705963938973648, + "grad_norm": 0.3872617031204001, + "learning_rate": 1.905314486606249e-06, + "loss": 0.3948, + "step": 5344 + }, + { + "epoch": 3.706657420249653, + "grad_norm": 0.37607961761091796, + "learning_rate": 1.9034135284600064e-06, + "loss": 0.408, + "step": 5345 + }, + { + "epoch": 3.707350901525659, + "grad_norm": 0.541932364786649, + "learning_rate": 1.9015132961312049e-06, + "loss": 0.3459, + "step": 5346 + }, + { + "epoch": 3.7080443828016643, + "grad_norm": 1.102306237579518, + "learning_rate": 1.8996137900652468e-06, + "loss": 0.3954, + "step": 5347 + }, + { + "epoch": 3.70873786407767, + "grad_norm": 0.4654186258181047, + "learning_rate": 1.8977150107073632e-06, + "loss": 0.3868, + "step": 5348 + }, + { + "epoch": 3.7094313453536754, + "grad_norm": 0.38346577307458257, + "learning_rate": 1.895816958502612e-06, + "loss": 0.3314, + "step": 5349 + }, + { + "epoch": 3.710124826629681, + "grad_norm": 0.3998605753085461, + "learning_rate": 1.893919633895886e-06, + "loss": 0.4209, + "step": 5350 + }, + { + "epoch": 3.7108183079056865, + "grad_norm": 0.44792728653393216, + "learning_rate": 1.892023037331901e-06, + "loss": 0.4073, + "step": 5351 + }, + { + "epoch": 3.7115117891816922, + "grad_norm": 0.3732161102516748, + "learning_rate": 1.8901271692552065e-06, + "loss": 0.3595, + "step": 5352 + }, + { + "epoch": 3.7122052704576975, + "grad_norm": 0.4245500412639838, + "learning_rate": 1.888232030110181e-06, + "loss": 0.3955, + "step": 5353 + }, + { + "epoch": 3.7128987517337033, + "grad_norm": 0.42570444772569693, + "learning_rate": 1.886337620341031e-06, + "loss": 0.4127, + "step": 5354 + }, + { + "epoch": 3.7135922330097086, + "grad_norm": 0.38576977020656966, + "learning_rate": 1.8844439403917947e-06, + "loss": 0.3632, + "step": 5355 + }, + { + "epoch": 3.7142857142857144, + "grad_norm": 0.43437060498772284, + "learning_rate": 1.8825509907063328e-06, + "loss": 0.4052, + "step": 5356 + }, + { + "epoch": 3.7149791955617197, + "grad_norm": 0.4439072161217333, + "learning_rate": 1.8806587717283415e-06, + "loss": 0.3832, + "step": 5357 + }, + { + "epoch": 3.7156726768377255, + "grad_norm": 0.39488729614130524, + "learning_rate": 1.8787672839013438e-06, + "loss": 0.4106, + "step": 5358 + }, + { + "epoch": 3.716366158113731, + "grad_norm": 0.5259586927761669, + "learning_rate": 1.8768765276686885e-06, + "loss": 0.3571, + "step": 5359 + }, + { + "epoch": 3.7170596393897366, + "grad_norm": 0.5127617716108047, + "learning_rate": 1.874986503473556e-06, + "loss": 0.3734, + "step": 5360 + }, + { + "epoch": 3.717753120665742, + "grad_norm": 0.41067658295379306, + "learning_rate": 1.8730972117589568e-06, + "loss": 0.4116, + "step": 5361 + }, + { + "epoch": 3.7184466019417477, + "grad_norm": 0.4330698754437948, + "learning_rate": 1.8712086529677214e-06, + "loss": 0.379, + "step": 5362 + }, + { + "epoch": 3.719140083217753, + "grad_norm": 0.4855578889819878, + "learning_rate": 1.8693208275425217e-06, + "loss": 0.3608, + "step": 5363 + }, + { + "epoch": 3.719833564493759, + "grad_norm": 0.3893809211511023, + "learning_rate": 1.8674337359258443e-06, + "loss": 0.3511, + "step": 5364 + }, + { + "epoch": 3.720527045769764, + "grad_norm": 0.42694298428814276, + "learning_rate": 1.8655473785600125e-06, + "loss": 0.3513, + "step": 5365 + }, + { + "epoch": 3.72122052704577, + "grad_norm": 0.4060773284838115, + "learning_rate": 1.863661755887176e-06, + "loss": 0.3863, + "step": 5366 + }, + { + "epoch": 3.721914008321775, + "grad_norm": 0.3988787844042086, + "learning_rate": 1.8617768683493082e-06, + "loss": 0.3802, + "step": 5367 + }, + { + "epoch": 3.722607489597781, + "grad_norm": 0.39907097823078685, + "learning_rate": 1.8598927163882136e-06, + "loss": 0.3977, + "step": 5368 + }, + { + "epoch": 3.7233009708737863, + "grad_norm": 0.37537444469144365, + "learning_rate": 1.858009300445527e-06, + "loss": 0.3428, + "step": 5369 + }, + { + "epoch": 3.723994452149792, + "grad_norm": 0.4179301049596186, + "learning_rate": 1.8561266209627026e-06, + "loss": 0.3368, + "step": 5370 + }, + { + "epoch": 3.7246879334257974, + "grad_norm": 0.38146528935992374, + "learning_rate": 1.8542446783810298e-06, + "loss": 0.3806, + "step": 5371 + }, + { + "epoch": 3.725381414701803, + "grad_norm": 0.3829881979556981, + "learning_rate": 1.8523634731416218e-06, + "loss": 0.3431, + "step": 5372 + }, + { + "epoch": 3.7260748959778085, + "grad_norm": 0.4090834194179888, + "learning_rate": 1.850483005685419e-06, + "loss": 0.334, + "step": 5373 + }, + { + "epoch": 3.7267683772538143, + "grad_norm": 0.3696294472764767, + "learning_rate": 1.8486032764531918e-06, + "loss": 0.3432, + "step": 5374 + }, + { + "epoch": 3.7274618585298196, + "grad_norm": 0.39134696174347977, + "learning_rate": 1.8467242858855312e-06, + "loss": 0.3474, + "step": 5375 + }, + { + "epoch": 3.7281553398058254, + "grad_norm": 0.42424056109601543, + "learning_rate": 1.8448460344228609e-06, + "loss": 0.3586, + "step": 5376 + }, + { + "epoch": 3.7288488210818307, + "grad_norm": 0.3806350558278461, + "learning_rate": 1.842968522505431e-06, + "loss": 0.379, + "step": 5377 + }, + { + "epoch": 3.7295423023578365, + "grad_norm": 0.37868728143788455, + "learning_rate": 1.841091750573314e-06, + "loss": 0.361, + "step": 5378 + }, + { + "epoch": 3.730235783633842, + "grad_norm": 0.39072976839261975, + "learning_rate": 1.8392157190664123e-06, + "loss": 0.3391, + "step": 5379 + }, + { + "epoch": 3.7309292649098476, + "grad_norm": 0.3981970209721558, + "learning_rate": 1.837340428424455e-06, + "loss": 0.3593, + "step": 5380 + }, + { + "epoch": 3.731622746185853, + "grad_norm": 0.42051350946993543, + "learning_rate": 1.8354658790869956e-06, + "loss": 0.413, + "step": 5381 + }, + { + "epoch": 3.7323162274618586, + "grad_norm": 0.4117186830406726, + "learning_rate": 1.833592071493418e-06, + "loss": 0.332, + "step": 5382 + }, + { + "epoch": 3.733009708737864, + "grad_norm": 0.38356762025880703, + "learning_rate": 1.8317190060829242e-06, + "loss": 0.3496, + "step": 5383 + }, + { + "epoch": 3.7337031900138697, + "grad_norm": 0.41540321153530235, + "learning_rate": 1.8298466832945499e-06, + "loss": 0.396, + "step": 5384 + }, + { + "epoch": 3.734396671289875, + "grad_norm": 0.4100911525683332, + "learning_rate": 1.8279751035671556e-06, + "loss": 0.4158, + "step": 5385 + }, + { + "epoch": 3.735090152565881, + "grad_norm": 0.4076169553299872, + "learning_rate": 1.8261042673394219e-06, + "loss": 0.3619, + "step": 5386 + }, + { + "epoch": 3.735783633841886, + "grad_norm": 0.4750286575047852, + "learning_rate": 1.8242341750498638e-06, + "loss": 0.3954, + "step": 5387 + }, + { + "epoch": 3.736477115117892, + "grad_norm": 0.4507221022178403, + "learning_rate": 1.8223648271368133e-06, + "loss": 0.408, + "step": 5388 + }, + { + "epoch": 3.7371705963938973, + "grad_norm": 0.37142770619311544, + "learning_rate": 1.8204962240384316e-06, + "loss": 0.3777, + "step": 5389 + }, + { + "epoch": 3.737864077669903, + "grad_norm": 0.3827900357951355, + "learning_rate": 1.8186283661927117e-06, + "loss": 0.401, + "step": 5390 + }, + { + "epoch": 3.7385575589459084, + "grad_norm": 0.4436942607606533, + "learning_rate": 1.8167612540374606e-06, + "loss": 0.3814, + "step": 5391 + }, + { + "epoch": 3.739251040221914, + "grad_norm": 0.3974637766807117, + "learning_rate": 1.8148948880103174e-06, + "loss": 0.3768, + "step": 5392 + }, + { + "epoch": 3.7399445214979194, + "grad_norm": 0.4554658431213549, + "learning_rate": 1.8130292685487466e-06, + "loss": 0.3855, + "step": 5393 + }, + { + "epoch": 3.740638002773925, + "grad_norm": 0.41099824440573857, + "learning_rate": 1.8111643960900321e-06, + "loss": 0.3534, + "step": 5394 + }, + { + "epoch": 3.7413314840499305, + "grad_norm": 0.43434492828653515, + "learning_rate": 1.8093002710712903e-06, + "loss": 0.3554, + "step": 5395 + }, + { + "epoch": 3.7420249653259363, + "grad_norm": 0.3632681244633984, + "learning_rate": 1.8074368939294555e-06, + "loss": 0.3315, + "step": 5396 + }, + { + "epoch": 3.7427184466019416, + "grad_norm": 0.39875603610965515, + "learning_rate": 1.8055742651012908e-06, + "loss": 0.4208, + "step": 5397 + }, + { + "epoch": 3.7434119278779474, + "grad_norm": 0.44358660190842997, + "learning_rate": 1.8037123850233833e-06, + "loss": 0.3694, + "step": 5398 + }, + { + "epoch": 3.7441054091539527, + "grad_norm": 0.3672623223520126, + "learning_rate": 1.8018512541321442e-06, + "loss": 0.3816, + "step": 5399 + }, + { + "epoch": 3.7447988904299585, + "grad_norm": 0.41301627423255255, + "learning_rate": 1.7999908728638104e-06, + "loss": 0.406, + "step": 5400 + }, + { + "epoch": 3.745492371705964, + "grad_norm": 0.3864285252967822, + "learning_rate": 1.7981312416544394e-06, + "loss": 0.3498, + "step": 5401 + }, + { + "epoch": 3.7461858529819696, + "grad_norm": 0.379854908989512, + "learning_rate": 1.7962723609399158e-06, + "loss": 0.3751, + "step": 5402 + }, + { + "epoch": 3.746879334257975, + "grad_norm": 0.3810957492909113, + "learning_rate": 1.7944142311559504e-06, + "loss": 0.3735, + "step": 5403 + }, + { + "epoch": 3.7475728155339807, + "grad_norm": 0.4075746678188746, + "learning_rate": 1.7925568527380717e-06, + "loss": 0.3557, + "step": 5404 + }, + { + "epoch": 3.748266296809986, + "grad_norm": 0.41268868819624693, + "learning_rate": 1.7907002261216367e-06, + "loss": 0.41, + "step": 5405 + }, + { + "epoch": 3.748959778085992, + "grad_norm": 0.4291613824698058, + "learning_rate": 1.788844351741828e-06, + "loss": 0.3818, + "step": 5406 + }, + { + "epoch": 3.749653259361997, + "grad_norm": 0.42162892709676997, + "learning_rate": 1.7869892300336434e-06, + "loss": 0.3789, + "step": 5407 + }, + { + "epoch": 3.750346740638003, + "grad_norm": 0.41320771168784265, + "learning_rate": 1.785134861431917e-06, + "loss": 0.3524, + "step": 5408 + }, + { + "epoch": 3.751040221914008, + "grad_norm": 0.3823140088359749, + "learning_rate": 1.783281246371294e-06, + "loss": 0.3471, + "step": 5409 + }, + { + "epoch": 3.751733703190014, + "grad_norm": 0.5566639626393827, + "learning_rate": 1.7814283852862507e-06, + "loss": 0.3626, + "step": 5410 + }, + { + "epoch": 3.7524271844660193, + "grad_norm": 0.5675627780797136, + "learning_rate": 1.7795762786110854e-06, + "loss": 0.3395, + "step": 5411 + }, + { + "epoch": 3.753120665742025, + "grad_norm": 0.4004817695691878, + "learning_rate": 1.777724926779915e-06, + "loss": 0.4007, + "step": 5412 + }, + { + "epoch": 3.7538141470180304, + "grad_norm": 0.37930504637433377, + "learning_rate": 1.7758743302266856e-06, + "loss": 0.3445, + "step": 5413 + }, + { + "epoch": 3.754507628294036, + "grad_norm": 0.7074186613546151, + "learning_rate": 1.7740244893851644e-06, + "loss": 0.3911, + "step": 5414 + }, + { + "epoch": 3.7552011095700415, + "grad_norm": 0.3891171357387995, + "learning_rate": 1.7721754046889373e-06, + "loss": 0.3487, + "step": 5415 + }, + { + "epoch": 3.7558945908460473, + "grad_norm": 0.6629439605906323, + "learning_rate": 1.7703270765714186e-06, + "loss": 0.3923, + "step": 5416 + }, + { + "epoch": 3.7565880721220526, + "grad_norm": 0.39184537825893406, + "learning_rate": 1.7684795054658427e-06, + "loss": 0.3557, + "step": 5417 + }, + { + "epoch": 3.7572815533980584, + "grad_norm": 0.42051575149751486, + "learning_rate": 1.7666326918052667e-06, + "loss": 0.3856, + "step": 5418 + }, + { + "epoch": 3.7579750346740637, + "grad_norm": 0.41142882284450016, + "learning_rate": 1.764786636022573e-06, + "loss": 0.3997, + "step": 5419 + }, + { + "epoch": 3.7586685159500695, + "grad_norm": 0.3811422041346554, + "learning_rate": 1.762941338550459e-06, + "loss": 0.3855, + "step": 5420 + }, + { + "epoch": 3.759361997226075, + "grad_norm": 0.4180845691643529, + "learning_rate": 1.7610967998214518e-06, + "loss": 0.4111, + "step": 5421 + }, + { + "epoch": 3.7600554785020806, + "grad_norm": 0.38518282689142475, + "learning_rate": 1.7592530202678986e-06, + "loss": 0.3812, + "step": 5422 + }, + { + "epoch": 3.760748959778086, + "grad_norm": 0.3768274803511191, + "learning_rate": 1.7574100003219657e-06, + "loss": 0.3326, + "step": 5423 + }, + { + "epoch": 3.7614424410540916, + "grad_norm": 0.405186290224963, + "learning_rate": 1.7555677404156446e-06, + "loss": 0.4337, + "step": 5424 + }, + { + "epoch": 3.762135922330097, + "grad_norm": 0.3765713509544661, + "learning_rate": 1.7537262409807476e-06, + "loss": 0.3695, + "step": 5425 + }, + { + "epoch": 3.7628294036061027, + "grad_norm": 0.37706643784277205, + "learning_rate": 1.7518855024489095e-06, + "loss": 0.3742, + "step": 5426 + }, + { + "epoch": 3.763522884882108, + "grad_norm": 0.3951727712955745, + "learning_rate": 1.7500455252515868e-06, + "loss": 0.3982, + "step": 5427 + }, + { + "epoch": 3.764216366158114, + "grad_norm": 0.4248050806632716, + "learning_rate": 1.7482063098200547e-06, + "loss": 0.361, + "step": 5428 + }, + { + "epoch": 3.764909847434119, + "grad_norm": 0.5283979933592003, + "learning_rate": 1.7463678565854126e-06, + "loss": 0.3808, + "step": 5429 + }, + { + "epoch": 3.765603328710125, + "grad_norm": 0.3972254502668182, + "learning_rate": 1.744530165978583e-06, + "loss": 0.3809, + "step": 5430 + }, + { + "epoch": 3.7662968099861303, + "grad_norm": 0.368240759054133, + "learning_rate": 1.742693238430303e-06, + "loss": 0.3829, + "step": 5431 + }, + { + "epoch": 3.766990291262136, + "grad_norm": 0.4037095185818751, + "learning_rate": 1.7408570743711394e-06, + "loss": 0.405, + "step": 5432 + }, + { + "epoch": 3.7676837725381414, + "grad_norm": 0.35559829706158025, + "learning_rate": 1.739021674231472e-06, + "loss": 0.3601, + "step": 5433 + }, + { + "epoch": 3.768377253814147, + "grad_norm": 0.38383401348562074, + "learning_rate": 1.7371870384415056e-06, + "loss": 0.3845, + "step": 5434 + }, + { + "epoch": 3.7690707350901524, + "grad_norm": 0.48961010708711433, + "learning_rate": 1.7353531674312702e-06, + "loss": 0.3369, + "step": 5435 + }, + { + "epoch": 3.769764216366158, + "grad_norm": 0.5793593113058321, + "learning_rate": 1.733520061630607e-06, + "loss": 0.3644, + "step": 5436 + }, + { + "epoch": 3.7704576976421635, + "grad_norm": 0.4257211794162759, + "learning_rate": 1.7316877214691863e-06, + "loss": 0.4077, + "step": 5437 + }, + { + "epoch": 3.7711511789181693, + "grad_norm": 0.5665832728275388, + "learning_rate": 1.7298561473764913e-06, + "loss": 0.3676, + "step": 5438 + }, + { + "epoch": 3.7718446601941746, + "grad_norm": 0.8191280650022053, + "learning_rate": 1.7280253397818319e-06, + "loss": 0.3684, + "step": 5439 + }, + { + "epoch": 3.7725381414701804, + "grad_norm": 0.6408806565485978, + "learning_rate": 1.7261952991143383e-06, + "loss": 0.3855, + "step": 5440 + }, + { + "epoch": 3.7732316227461857, + "grad_norm": 0.34447507846391334, + "learning_rate": 1.7243660258029543e-06, + "loss": 0.3373, + "step": 5441 + }, + { + "epoch": 3.7739251040221915, + "grad_norm": 0.3752033900856243, + "learning_rate": 1.722537520276451e-06, + "loss": 0.3965, + "step": 5442 + }, + { + "epoch": 3.774618585298197, + "grad_norm": 0.3775413093957538, + "learning_rate": 1.7207097829634168e-06, + "loss": 0.3541, + "step": 5443 + }, + { + "epoch": 3.7753120665742026, + "grad_norm": 0.3486904471046564, + "learning_rate": 1.7188828142922586e-06, + "loss": 0.3514, + "step": 5444 + }, + { + "epoch": 3.776005547850208, + "grad_norm": 0.37464889394532563, + "learning_rate": 1.7170566146912083e-06, + "loss": 0.3729, + "step": 5445 + }, + { + "epoch": 3.7766990291262137, + "grad_norm": 0.3610622241819802, + "learning_rate": 1.7152311845883096e-06, + "loss": 0.3462, + "step": 5446 + }, + { + "epoch": 3.777392510402219, + "grad_norm": 0.39130213083198795, + "learning_rate": 1.7134065244114318e-06, + "loss": 0.38, + "step": 5447 + }, + { + "epoch": 3.778085991678225, + "grad_norm": 0.39809161010492583, + "learning_rate": 1.7115826345882635e-06, + "loss": 0.3747, + "step": 5448 + }, + { + "epoch": 3.77877947295423, + "grad_norm": 0.40575456218582356, + "learning_rate": 1.7097595155463082e-06, + "loss": 0.3248, + "step": 5449 + }, + { + "epoch": 3.779472954230236, + "grad_norm": 0.3737679094783724, + "learning_rate": 1.7079371677128937e-06, + "loss": 0.3832, + "step": 5450 + }, + { + "epoch": 3.780166435506241, + "grad_norm": 0.4116349378860452, + "learning_rate": 1.706115591515166e-06, + "loss": 0.3867, + "step": 5451 + }, + { + "epoch": 3.780859916782247, + "grad_norm": 0.400088369393815, + "learning_rate": 1.7042947873800853e-06, + "loss": 0.3699, + "step": 5452 + }, + { + "epoch": 3.7815533980582523, + "grad_norm": 0.4285870819140972, + "learning_rate": 1.7024747557344411e-06, + "loss": 0.3582, + "step": 5453 + }, + { + "epoch": 3.782246879334258, + "grad_norm": 0.46475979435347403, + "learning_rate": 1.7006554970048305e-06, + "loss": 0.3912, + "step": 5454 + }, + { + "epoch": 3.7829403606102634, + "grad_norm": 0.38994414406106287, + "learning_rate": 1.6988370116176766e-06, + "loss": 0.3443, + "step": 5455 + }, + { + "epoch": 3.783633841886269, + "grad_norm": 0.3676251739064922, + "learning_rate": 1.6970192999992209e-06, + "loss": 0.376, + "step": 5456 + }, + { + "epoch": 3.7843273231622745, + "grad_norm": 0.3766643128839153, + "learning_rate": 1.6952023625755176e-06, + "loss": 0.4045, + "step": 5457 + }, + { + "epoch": 3.7850208044382803, + "grad_norm": 0.421874601290078, + "learning_rate": 1.6933861997724466e-06, + "loss": 0.3826, + "step": 5458 + }, + { + "epoch": 3.7857142857142856, + "grad_norm": 0.3950155661205429, + "learning_rate": 1.6915708120157042e-06, + "loss": 0.4299, + "step": 5459 + }, + { + "epoch": 3.7864077669902914, + "grad_norm": 0.3855825900703638, + "learning_rate": 1.6897561997308015e-06, + "loss": 0.4045, + "step": 5460 + }, + { + "epoch": 3.7871012482662967, + "grad_norm": 0.41393549300354765, + "learning_rate": 1.6879423633430708e-06, + "loss": 0.3766, + "step": 5461 + }, + { + "epoch": 3.7877947295423025, + "grad_norm": 0.41426592591495687, + "learning_rate": 1.6861293032776637e-06, + "loss": 0.3884, + "step": 5462 + }, + { + "epoch": 3.7884882108183078, + "grad_norm": 0.3961452540539228, + "learning_rate": 1.6843170199595476e-06, + "loss": 0.396, + "step": 5463 + }, + { + "epoch": 3.7891816920943135, + "grad_norm": 0.386811719574885, + "learning_rate": 1.6825055138135105e-06, + "loss": 0.3322, + "step": 5464 + }, + { + "epoch": 3.789875173370319, + "grad_norm": 0.3992245548766229, + "learning_rate": 1.680694785264153e-06, + "loss": 0.4281, + "step": 5465 + }, + { + "epoch": 3.7905686546463246, + "grad_norm": 0.45470336572811776, + "learning_rate": 1.6788848347358977e-06, + "loss": 0.3562, + "step": 5466 + }, + { + "epoch": 3.79126213592233, + "grad_norm": 0.40341159288487516, + "learning_rate": 1.6770756626529866e-06, + "loss": 0.3561, + "step": 5467 + }, + { + "epoch": 3.7919556171983357, + "grad_norm": 0.38860531381598035, + "learning_rate": 1.675267269439473e-06, + "loss": 0.3702, + "step": 5468 + }, + { + "epoch": 3.792649098474341, + "grad_norm": 0.3830640348037389, + "learning_rate": 1.6734596555192323e-06, + "loss": 0.309, + "step": 5469 + }, + { + "epoch": 3.793342579750347, + "grad_norm": 0.3981754230115332, + "learning_rate": 1.671652821315956e-06, + "loss": 0.4194, + "step": 5470 + }, + { + "epoch": 3.794036061026352, + "grad_norm": 0.41845316099717067, + "learning_rate": 1.6698467672531538e-06, + "loss": 0.3983, + "step": 5471 + }, + { + "epoch": 3.794729542302358, + "grad_norm": 0.3614566715835641, + "learning_rate": 1.6680414937541528e-06, + "loss": 0.3638, + "step": 5472 + }, + { + "epoch": 3.7954230235783633, + "grad_norm": 0.3959607555342449, + "learning_rate": 1.6662370012420931e-06, + "loss": 0.3792, + "step": 5473 + }, + { + "epoch": 3.796116504854369, + "grad_norm": 0.40082331373332036, + "learning_rate": 1.6644332901399357e-06, + "loss": 0.349, + "step": 5474 + }, + { + "epoch": 3.7968099861303743, + "grad_norm": 0.4534061975835003, + "learning_rate": 1.6626303608704597e-06, + "loss": 0.4056, + "step": 5475 + }, + { + "epoch": 3.79750346740638, + "grad_norm": 0.354771762862086, + "learning_rate": 1.6608282138562554e-06, + "loss": 0.3616, + "step": 5476 + }, + { + "epoch": 3.7981969486823854, + "grad_norm": 0.5109905447615416, + "learning_rate": 1.6590268495197354e-06, + "loss": 0.3448, + "step": 5477 + }, + { + "epoch": 3.798890429958391, + "grad_norm": 0.38350273614016517, + "learning_rate": 1.6572262682831241e-06, + "loss": 0.3924, + "step": 5478 + }, + { + "epoch": 3.7995839112343965, + "grad_norm": 0.5148563619196473, + "learning_rate": 1.655426470568464e-06, + "loss": 0.3874, + "step": 5479 + }, + { + "epoch": 3.8002773925104023, + "grad_norm": 0.4039022629715126, + "learning_rate": 1.6536274567976202e-06, + "loss": 0.3808, + "step": 5480 + }, + { + "epoch": 3.8009708737864076, + "grad_norm": 0.3649295170517451, + "learning_rate": 1.6518292273922631e-06, + "loss": 0.325, + "step": 5481 + }, + { + "epoch": 3.8016643550624134, + "grad_norm": 0.460181294015958, + "learning_rate": 1.6500317827738887e-06, + "loss": 0.3713, + "step": 5482 + }, + { + "epoch": 3.8023578363384187, + "grad_norm": 0.3983541745056531, + "learning_rate": 1.6482351233638006e-06, + "loss": 0.3985, + "step": 5483 + }, + { + "epoch": 3.8030513176144245, + "grad_norm": 0.4264831632946882, + "learning_rate": 1.6464392495831254e-06, + "loss": 0.4347, + "step": 5484 + }, + { + "epoch": 3.80374479889043, + "grad_norm": 0.35874030024608955, + "learning_rate": 1.6446441618528037e-06, + "loss": 0.351, + "step": 5485 + }, + { + "epoch": 3.8044382801664356, + "grad_norm": 0.4704866595577384, + "learning_rate": 1.6428498605935884e-06, + "loss": 0.4184, + "step": 5486 + }, + { + "epoch": 3.805131761442441, + "grad_norm": 0.37762854782032007, + "learning_rate": 1.641056346226052e-06, + "loss": 0.3565, + "step": 5487 + }, + { + "epoch": 3.8058252427184467, + "grad_norm": 0.3791613377168742, + "learning_rate": 1.6392636191705818e-06, + "loss": 0.3844, + "step": 5488 + }, + { + "epoch": 3.806518723994452, + "grad_norm": 0.5263251153146938, + "learning_rate": 1.6374716798473795e-06, + "loss": 0.379, + "step": 5489 + }, + { + "epoch": 3.807212205270458, + "grad_norm": 0.396715833344976, + "learning_rate": 1.6356805286764644e-06, + "loss": 0.3649, + "step": 5490 + }, + { + "epoch": 3.807905686546463, + "grad_norm": 0.39121884866498163, + "learning_rate": 1.6338901660776662e-06, + "loss": 0.3606, + "step": 5491 + }, + { + "epoch": 3.808599167822469, + "grad_norm": 0.40171467739184663, + "learning_rate": 1.6321005924706346e-06, + "loss": 0.3598, + "step": 5492 + }, + { + "epoch": 3.809292649098474, + "grad_norm": 0.38101576787076014, + "learning_rate": 1.6303118082748342e-06, + "loss": 0.3951, + "step": 5493 + }, + { + "epoch": 3.80998613037448, + "grad_norm": 0.429169630095379, + "learning_rate": 1.62852381390954e-06, + "loss": 0.3907, + "step": 5494 + }, + { + "epoch": 3.8106796116504853, + "grad_norm": 0.39039933858681025, + "learning_rate": 1.6267366097938464e-06, + "loss": 0.3363, + "step": 5495 + }, + { + "epoch": 3.811373092926491, + "grad_norm": 0.4236816842037406, + "learning_rate": 1.624950196346663e-06, + "loss": 0.3463, + "step": 5496 + }, + { + "epoch": 3.8120665742024964, + "grad_norm": 0.39379346995752024, + "learning_rate": 1.6231645739867062e-06, + "loss": 0.3396, + "step": 5497 + }, + { + "epoch": 3.812760055478502, + "grad_norm": 0.39719999992746835, + "learning_rate": 1.6213797431325212e-06, + "loss": 0.3458, + "step": 5498 + }, + { + "epoch": 3.8134535367545075, + "grad_norm": 0.3961760109207296, + "learning_rate": 1.6195957042024536e-06, + "loss": 0.4086, + "step": 5499 + }, + { + "epoch": 3.8141470180305133, + "grad_norm": 0.37441061629038924, + "learning_rate": 1.6178124576146708e-06, + "loss": 0.3603, + "step": 5500 + }, + { + "epoch": 3.8148404993065186, + "grad_norm": 0.3727463774789075, + "learning_rate": 1.6160300037871547e-06, + "loss": 0.3843, + "step": 5501 + }, + { + "epoch": 3.8155339805825244, + "grad_norm": 0.38891618422764807, + "learning_rate": 1.6142483431376959e-06, + "loss": 0.333, + "step": 5502 + }, + { + "epoch": 3.8162274618585297, + "grad_norm": 0.416956163142673, + "learning_rate": 1.612467476083905e-06, + "loss": 0.39, + "step": 5503 + }, + { + "epoch": 3.8169209431345354, + "grad_norm": 0.3937672991999547, + "learning_rate": 1.610687403043205e-06, + "loss": 0.375, + "step": 5504 + }, + { + "epoch": 3.8176144244105408, + "grad_norm": 0.3668701628736196, + "learning_rate": 1.6089081244328285e-06, + "loss": 0.3484, + "step": 5505 + }, + { + "epoch": 3.8183079056865465, + "grad_norm": 0.38609486838441315, + "learning_rate": 1.6071296406698317e-06, + "loss": 0.4038, + "step": 5506 + }, + { + "epoch": 3.819001386962552, + "grad_norm": 0.4121516078389864, + "learning_rate": 1.6053519521710726e-06, + "loss": 0.385, + "step": 5507 + }, + { + "epoch": 3.8196948682385576, + "grad_norm": 0.3967922793826194, + "learning_rate": 1.6035750593532312e-06, + "loss": 0.3684, + "step": 5508 + }, + { + "epoch": 3.820388349514563, + "grad_norm": 0.46902752964958344, + "learning_rate": 1.601798962632799e-06, + "loss": 0.3707, + "step": 5509 + }, + { + "epoch": 3.8210818307905687, + "grad_norm": 0.3973459652136806, + "learning_rate": 1.600023662426078e-06, + "loss": 0.3653, + "step": 5510 + }, + { + "epoch": 3.821775312066574, + "grad_norm": 0.4770230619345637, + "learning_rate": 1.5982491591491861e-06, + "loss": 0.3606, + "step": 5511 + }, + { + "epoch": 3.82246879334258, + "grad_norm": 0.4368077722486454, + "learning_rate": 1.5964754532180564e-06, + "loss": 0.3884, + "step": 5512 + }, + { + "epoch": 3.823162274618585, + "grad_norm": 0.3862600988526136, + "learning_rate": 1.59470254504843e-06, + "loss": 0.3755, + "step": 5513 + }, + { + "epoch": 3.823855755894591, + "grad_norm": 0.6897639021099272, + "learning_rate": 1.592930435055864e-06, + "loss": 0.3878, + "step": 5514 + }, + { + "epoch": 3.8245492371705962, + "grad_norm": 0.39283478602548016, + "learning_rate": 1.5911591236557288e-06, + "loss": 0.3815, + "step": 5515 + }, + { + "epoch": 3.825242718446602, + "grad_norm": 0.4040414188632428, + "learning_rate": 1.589388611263208e-06, + "loss": 0.4013, + "step": 5516 + }, + { + "epoch": 3.8259361997226073, + "grad_norm": 0.6232878797513218, + "learning_rate": 1.5876188982932966e-06, + "loss": 0.3434, + "step": 5517 + }, + { + "epoch": 3.826629680998613, + "grad_norm": 0.39002720526485685, + "learning_rate": 1.5858499851608006e-06, + "loss": 0.4065, + "step": 5518 + }, + { + "epoch": 3.8273231622746184, + "grad_norm": 0.3841094765149276, + "learning_rate": 1.5840818722803413e-06, + "loss": 0.3786, + "step": 5519 + }, + { + "epoch": 3.828016643550624, + "grad_norm": 0.3968586001557126, + "learning_rate": 1.5823145600663536e-06, + "loss": 0.3983, + "step": 5520 + }, + { + "epoch": 3.8287101248266295, + "grad_norm": 0.40974053765869695, + "learning_rate": 1.5805480489330798e-06, + "loss": 0.4008, + "step": 5521 + }, + { + "epoch": 3.8294036061026353, + "grad_norm": 0.41412119996826335, + "learning_rate": 1.5787823392945794e-06, + "loss": 0.3833, + "step": 5522 + }, + { + "epoch": 3.8300970873786406, + "grad_norm": 0.3735504340365167, + "learning_rate": 1.5770174315647185e-06, + "loss": 0.361, + "step": 5523 + }, + { + "epoch": 3.8307905686546464, + "grad_norm": 0.42026566451047853, + "learning_rate": 1.575253326157183e-06, + "loss": 0.3776, + "step": 5524 + }, + { + "epoch": 3.8314840499306517, + "grad_norm": 0.4014227174885103, + "learning_rate": 1.5734900234854655e-06, + "loss": 0.4175, + "step": 5525 + }, + { + "epoch": 3.8321775312066575, + "grad_norm": 0.5201046773001922, + "learning_rate": 1.5717275239628693e-06, + "loss": 0.413, + "step": 5526 + }, + { + "epoch": 3.832871012482663, + "grad_norm": 0.38269945858021276, + "learning_rate": 1.5699658280025143e-06, + "loss": 0.3735, + "step": 5527 + }, + { + "epoch": 3.8335644937586686, + "grad_norm": 0.40546504630034674, + "learning_rate": 1.5682049360173263e-06, + "loss": 0.3789, + "step": 5528 + }, + { + "epoch": 3.834257975034674, + "grad_norm": 0.4188339497934117, + "learning_rate": 1.5664448484200468e-06, + "loss": 0.3659, + "step": 5529 + }, + { + "epoch": 3.8349514563106797, + "grad_norm": 0.43039825713556573, + "learning_rate": 1.5646855656232296e-06, + "loss": 0.4021, + "step": 5530 + }, + { + "epoch": 3.835644937586685, + "grad_norm": 0.4469033144459223, + "learning_rate": 1.5629270880392345e-06, + "loss": 0.4061, + "step": 5531 + }, + { + "epoch": 3.836338418862691, + "grad_norm": 0.36411846985312857, + "learning_rate": 1.5611694160802377e-06, + "loss": 0.3798, + "step": 5532 + }, + { + "epoch": 3.837031900138696, + "grad_norm": 0.4165456791596007, + "learning_rate": 1.5594125501582241e-06, + "loss": 0.3771, + "step": 5533 + }, + { + "epoch": 3.837725381414702, + "grad_norm": 0.4798051615631476, + "learning_rate": 1.5576564906849918e-06, + "loss": 0.4217, + "step": 5534 + }, + { + "epoch": 3.838418862690707, + "grad_norm": 0.4654707514678164, + "learning_rate": 1.5559012380721484e-06, + "loss": 0.4034, + "step": 5535 + }, + { + "epoch": 3.839112343966713, + "grad_norm": 0.43916135815319507, + "learning_rate": 1.5541467927311093e-06, + "loss": 0.3637, + "step": 5536 + }, + { + "epoch": 3.8398058252427183, + "grad_norm": 0.40499701686065925, + "learning_rate": 1.552393155073107e-06, + "loss": 0.3487, + "step": 5537 + }, + { + "epoch": 3.840499306518724, + "grad_norm": 0.5294692785153393, + "learning_rate": 1.5506403255091812e-06, + "loss": 0.335, + "step": 5538 + }, + { + "epoch": 3.8411927877947294, + "grad_norm": 0.39026294615982876, + "learning_rate": 1.5488883044501807e-06, + "loss": 0.3956, + "step": 5539 + }, + { + "epoch": 3.841886269070735, + "grad_norm": 0.368752034052551, + "learning_rate": 1.5471370923067668e-06, + "loss": 0.3897, + "step": 5540 + }, + { + "epoch": 3.8425797503467405, + "grad_norm": 0.40896760470203153, + "learning_rate": 1.5453866894894126e-06, + "loss": 0.3944, + "step": 5541 + }, + { + "epoch": 3.8432732316227463, + "grad_norm": 0.4001042563506018, + "learning_rate": 1.543637096408398e-06, + "loss": 0.3849, + "step": 5542 + }, + { + "epoch": 3.8439667128987516, + "grad_norm": 0.4139012941974305, + "learning_rate": 1.5418883134738178e-06, + "loss": 0.3809, + "step": 5543 + }, + { + "epoch": 3.8446601941747574, + "grad_norm": 0.3973421077339701, + "learning_rate": 1.5401403410955707e-06, + "loss": 0.4022, + "step": 5544 + }, + { + "epoch": 3.8453536754507627, + "grad_norm": 0.37951204611751804, + "learning_rate": 1.5383931796833702e-06, + "loss": 0.3424, + "step": 5545 + }, + { + "epoch": 3.8460471567267684, + "grad_norm": 0.4156068002622164, + "learning_rate": 1.5366468296467397e-06, + "loss": 0.4001, + "step": 5546 + }, + { + "epoch": 3.8467406380027738, + "grad_norm": 0.37240364982023866, + "learning_rate": 1.534901291395008e-06, + "loss": 0.3422, + "step": 5547 + }, + { + "epoch": 3.8474341192787795, + "grad_norm": 0.37031637090926967, + "learning_rate": 1.5331565653373176e-06, + "loss": 0.3451, + "step": 5548 + }, + { + "epoch": 3.848127600554785, + "grad_norm": 0.4410069310255653, + "learning_rate": 1.5314126518826222e-06, + "loss": 0.3871, + "step": 5549 + }, + { + "epoch": 3.8488210818307906, + "grad_norm": 0.4299440737340537, + "learning_rate": 1.5296695514396776e-06, + "loss": 0.4399, + "step": 5550 + }, + { + "epoch": 3.849514563106796, + "grad_norm": 0.9455654023015365, + "learning_rate": 1.52792726441706e-06, + "loss": 0.4149, + "step": 5551 + }, + { + "epoch": 3.8502080443828017, + "grad_norm": 0.3790694636307669, + "learning_rate": 1.5261857912231438e-06, + "loss": 0.3137, + "step": 5552 + }, + { + "epoch": 3.850901525658807, + "grad_norm": 0.4147510509227642, + "learning_rate": 1.5244451322661197e-06, + "loss": 0.3942, + "step": 5553 + }, + { + "epoch": 3.851595006934813, + "grad_norm": 0.43063851256519503, + "learning_rate": 1.5227052879539872e-06, + "loss": 0.3755, + "step": 5554 + }, + { + "epoch": 3.852288488210818, + "grad_norm": 0.45412017914645153, + "learning_rate": 1.5209662586945496e-06, + "loss": 0.414, + "step": 5555 + }, + { + "epoch": 3.852981969486824, + "grad_norm": 0.4675562645996263, + "learning_rate": 1.5192280448954244e-06, + "loss": 0.3895, + "step": 5556 + }, + { + "epoch": 3.8536754507628292, + "grad_norm": 0.39443791189051286, + "learning_rate": 1.5174906469640387e-06, + "loss": 0.381, + "step": 5557 + }, + { + "epoch": 3.854368932038835, + "grad_norm": 0.38723469162640956, + "learning_rate": 1.515754065307622e-06, + "loss": 0.3542, + "step": 5558 + }, + { + "epoch": 3.8550624133148403, + "grad_norm": 0.4032888749927211, + "learning_rate": 1.5140183003332182e-06, + "loss": 0.3902, + "step": 5559 + }, + { + "epoch": 3.855755894590846, + "grad_norm": 0.43150375891932785, + "learning_rate": 1.5122833524476782e-06, + "loss": 0.3759, + "step": 5560 + }, + { + "epoch": 3.8564493758668514, + "grad_norm": 0.37765765952643526, + "learning_rate": 1.5105492220576612e-06, + "loss": 0.4069, + "step": 5561 + }, + { + "epoch": 3.857142857142857, + "grad_norm": 0.4376355760621776, + "learning_rate": 1.5088159095696365e-06, + "loss": 0.3218, + "step": 5562 + }, + { + "epoch": 3.8578363384188625, + "grad_norm": 0.41721282862940745, + "learning_rate": 1.5070834153898766e-06, + "loss": 0.3419, + "step": 5563 + }, + { + "epoch": 3.8585298196948683, + "grad_norm": 0.35353951227298963, + "learning_rate": 1.5053517399244672e-06, + "loss": 0.3654, + "step": 5564 + }, + { + "epoch": 3.8592233009708736, + "grad_norm": 0.40098806127352365, + "learning_rate": 1.503620883579302e-06, + "loss": 0.3776, + "step": 5565 + }, + { + "epoch": 3.8599167822468794, + "grad_norm": 0.4240605348191167, + "learning_rate": 1.5018908467600778e-06, + "loss": 0.3789, + "step": 5566 + }, + { + "epoch": 3.8606102635228847, + "grad_norm": 0.3803201875493985, + "learning_rate": 1.500161629872307e-06, + "loss": 0.4193, + "step": 5567 + }, + { + "epoch": 3.8613037447988905, + "grad_norm": 0.4062339254914316, + "learning_rate": 1.4984332333212998e-06, + "loss": 0.4119, + "step": 5568 + }, + { + "epoch": 3.861997226074896, + "grad_norm": 0.4358161580514588, + "learning_rate": 1.4967056575121842e-06, + "loss": 0.4269, + "step": 5569 + }, + { + "epoch": 3.8626907073509016, + "grad_norm": 0.5367904873711541, + "learning_rate": 1.4949789028498923e-06, + "loss": 0.3777, + "step": 5570 + }, + { + "epoch": 3.863384188626907, + "grad_norm": 0.39838761560473596, + "learning_rate": 1.4932529697391596e-06, + "loss": 0.3914, + "step": 5571 + }, + { + "epoch": 3.8640776699029127, + "grad_norm": 0.4300381966464757, + "learning_rate": 1.491527858584535e-06, + "loss": 0.3728, + "step": 5572 + }, + { + "epoch": 3.864771151178918, + "grad_norm": 0.4246401021355444, + "learning_rate": 1.4898035697903694e-06, + "loss": 0.3806, + "step": 5573 + }, + { + "epoch": 3.8654646324549238, + "grad_norm": 0.3955762135866378, + "learning_rate": 1.488080103760825e-06, + "loss": 0.378, + "step": 5574 + }, + { + "epoch": 3.866158113730929, + "grad_norm": 0.37439770032132935, + "learning_rate": 1.4863574608998716e-06, + "loss": 0.3427, + "step": 5575 + }, + { + "epoch": 3.866851595006935, + "grad_norm": 0.4127301950287792, + "learning_rate": 1.4846356416112805e-06, + "loss": 0.4105, + "step": 5576 + }, + { + "epoch": 3.86754507628294, + "grad_norm": 0.4176112273482195, + "learning_rate": 1.4829146462986354e-06, + "loss": 0.3045, + "step": 5577 + }, + { + "epoch": 3.868238557558946, + "grad_norm": 0.3920737918779999, + "learning_rate": 1.4811944753653256e-06, + "loss": 0.3682, + "step": 5578 + }, + { + "epoch": 3.8689320388349513, + "grad_norm": 0.41651613206030125, + "learning_rate": 1.4794751292145465e-06, + "loss": 0.4184, + "step": 5579 + }, + { + "epoch": 3.869625520110957, + "grad_norm": 0.3790228975308277, + "learning_rate": 1.4777566082493017e-06, + "loss": 0.3893, + "step": 5580 + }, + { + "epoch": 3.8703190013869624, + "grad_norm": 0.43957142965703966, + "learning_rate": 1.4760389128723968e-06, + "loss": 0.4308, + "step": 5581 + }, + { + "epoch": 3.871012482662968, + "grad_norm": 0.37992295707935186, + "learning_rate": 1.4743220434864492e-06, + "loss": 0.3832, + "step": 5582 + }, + { + "epoch": 3.8717059639389735, + "grad_norm": 0.419379184564309, + "learning_rate": 1.4726060004938819e-06, + "loss": 0.4206, + "step": 5583 + }, + { + "epoch": 3.8723994452149793, + "grad_norm": 0.42429959323969607, + "learning_rate": 1.47089078429692e-06, + "loss": 0.3823, + "step": 5584 + }, + { + "epoch": 3.8730929264909846, + "grad_norm": 0.36699942270029495, + "learning_rate": 1.4691763952975996e-06, + "loss": 0.3717, + "step": 5585 + }, + { + "epoch": 3.8737864077669903, + "grad_norm": 0.3875732509319577, + "learning_rate": 1.4674628338977604e-06, + "loss": 0.3553, + "step": 5586 + }, + { + "epoch": 3.8744798890429957, + "grad_norm": 0.3850202003941385, + "learning_rate": 1.4657501004990488e-06, + "loss": 0.364, + "step": 5587 + }, + { + "epoch": 3.8751733703190014, + "grad_norm": 1.5284101176859395, + "learning_rate": 1.4640381955029193e-06, + "loss": 0.4049, + "step": 5588 + }, + { + "epoch": 3.875866851595007, + "grad_norm": 0.38956167245449974, + "learning_rate": 1.4623271193106264e-06, + "loss": 0.3588, + "step": 5589 + }, + { + "epoch": 3.8765603328710125, + "grad_norm": 0.40227087344775136, + "learning_rate": 1.460616872323236e-06, + "loss": 0.3976, + "step": 5590 + }, + { + "epoch": 3.877253814147018, + "grad_norm": 0.3940537743117503, + "learning_rate": 1.4589074549416188e-06, + "loss": 0.3805, + "step": 5591 + }, + { + "epoch": 3.8779472954230236, + "grad_norm": 0.41411441939648697, + "learning_rate": 1.4571988675664467e-06, + "loss": 0.4063, + "step": 5592 + }, + { + "epoch": 3.8786407766990294, + "grad_norm": 0.5979411231554002, + "learning_rate": 1.4554911105982022e-06, + "loss": 0.3726, + "step": 5593 + }, + { + "epoch": 3.8793342579750347, + "grad_norm": 0.36969918434948273, + "learning_rate": 1.4537841844371719e-06, + "loss": 0.3836, + "step": 5594 + }, + { + "epoch": 3.88002773925104, + "grad_norm": 0.40293111629174605, + "learning_rate": 1.452078089483443e-06, + "loss": 0.3681, + "step": 5595 + }, + { + "epoch": 3.880721220527046, + "grad_norm": 0.4069882307746121, + "learning_rate": 1.4503728261369176e-06, + "loss": 0.3766, + "step": 5596 + }, + { + "epoch": 3.8814147018030516, + "grad_norm": 0.3714067860738447, + "learning_rate": 1.448668394797293e-06, + "loss": 0.3473, + "step": 5597 + }, + { + "epoch": 3.882108183079057, + "grad_norm": 0.4829169868726462, + "learning_rate": 1.4469647958640758e-06, + "loss": 0.3978, + "step": 5598 + }, + { + "epoch": 3.8828016643550622, + "grad_norm": 0.4325020462857424, + "learning_rate": 1.4452620297365804e-06, + "loss": 0.4641, + "step": 5599 + }, + { + "epoch": 3.883495145631068, + "grad_norm": 0.3964269076699933, + "learning_rate": 1.4435600968139192e-06, + "loss": 0.4125, + "step": 5600 + }, + { + "epoch": 3.884188626907074, + "grad_norm": 0.3801322339862502, + "learning_rate": 1.4418589974950142e-06, + "loss": 0.3813, + "step": 5601 + }, + { + "epoch": 3.884882108183079, + "grad_norm": 1.120420651707417, + "learning_rate": 1.4401587321785927e-06, + "loss": 0.3464, + "step": 5602 + }, + { + "epoch": 3.8855755894590844, + "grad_norm": 0.3850211652434562, + "learning_rate": 1.438459301263181e-06, + "loss": 0.3426, + "step": 5603 + }, + { + "epoch": 3.88626907073509, + "grad_norm": 0.5067998945282288, + "learning_rate": 1.436760705147115e-06, + "loss": 0.4075, + "step": 5604 + }, + { + "epoch": 3.886962552011096, + "grad_norm": 0.4173378607138257, + "learning_rate": 1.4350629442285336e-06, + "loss": 0.3854, + "step": 5605 + }, + { + "epoch": 3.8876560332871013, + "grad_norm": 0.38862495150285326, + "learning_rate": 1.4333660189053794e-06, + "loss": 0.3792, + "step": 5606 + }, + { + "epoch": 3.8883495145631066, + "grad_norm": 0.449690086866131, + "learning_rate": 1.4316699295754016e-06, + "loss": 0.38, + "step": 5607 + }, + { + "epoch": 3.8890429958391124, + "grad_norm": 1.0670879888431246, + "learning_rate": 1.4299746766361477e-06, + "loss": 0.3595, + "step": 5608 + }, + { + "epoch": 3.889736477115118, + "grad_norm": 0.5401108553541221, + "learning_rate": 1.4282802604849754e-06, + "loss": 0.4003, + "step": 5609 + }, + { + "epoch": 3.8904299583911235, + "grad_norm": 0.3794215135031165, + "learning_rate": 1.426586681519041e-06, + "loss": 0.3628, + "step": 5610 + }, + { + "epoch": 3.891123439667129, + "grad_norm": 0.37930390905488165, + "learning_rate": 1.424893940135309e-06, + "loss": 0.3797, + "step": 5611 + }, + { + "epoch": 3.8918169209431346, + "grad_norm": 0.48747810963585964, + "learning_rate": 1.4232020367305466e-06, + "loss": 0.3522, + "step": 5612 + }, + { + "epoch": 3.8925104022191404, + "grad_norm": 0.3823598611920294, + "learning_rate": 1.4215109717013193e-06, + "loss": 0.3676, + "step": 5613 + }, + { + "epoch": 3.8932038834951457, + "grad_norm": 0.3884920884811159, + "learning_rate": 1.4198207454440048e-06, + "loss": 0.3588, + "step": 5614 + }, + { + "epoch": 3.893897364771151, + "grad_norm": 0.42249768676696087, + "learning_rate": 1.4181313583547807e-06, + "loss": 0.4018, + "step": 5615 + }, + { + "epoch": 3.8945908460471568, + "grad_norm": 0.3855575000463757, + "learning_rate": 1.416442810829623e-06, + "loss": 0.3596, + "step": 5616 + }, + { + "epoch": 3.8952843273231625, + "grad_norm": 0.435829374064413, + "learning_rate": 1.4147551032643192e-06, + "loss": 0.3645, + "step": 5617 + }, + { + "epoch": 3.895977808599168, + "grad_norm": 0.6052173329254579, + "learning_rate": 1.4130682360544518e-06, + "loss": 0.4031, + "step": 5618 + }, + { + "epoch": 3.896671289875173, + "grad_norm": 0.4072705047921821, + "learning_rate": 1.4113822095954122e-06, + "loss": 0.3953, + "step": 5619 + }, + { + "epoch": 3.897364771151179, + "grad_norm": 0.429474477464326, + "learning_rate": 1.4096970242823943e-06, + "loss": 0.3595, + "step": 5620 + }, + { + "epoch": 3.8980582524271847, + "grad_norm": 0.5287925548365876, + "learning_rate": 1.40801268051039e-06, + "loss": 0.3784, + "step": 5621 + }, + { + "epoch": 3.89875173370319, + "grad_norm": 0.3807454717369341, + "learning_rate": 1.406329178674199e-06, + "loss": 0.3722, + "step": 5622 + }, + { + "epoch": 3.8994452149791954, + "grad_norm": 0.38783373190927745, + "learning_rate": 1.4046465191684223e-06, + "loss": 0.3748, + "step": 5623 + }, + { + "epoch": 3.900138696255201, + "grad_norm": 0.4601025319851407, + "learning_rate": 1.4029647023874621e-06, + "loss": 0.4235, + "step": 5624 + }, + { + "epoch": 3.900832177531207, + "grad_norm": 0.38491422370673306, + "learning_rate": 1.4012837287255266e-06, + "loss": 0.3693, + "step": 5625 + }, + { + "epoch": 3.9015256588072122, + "grad_norm": 0.3699367415628008, + "learning_rate": 1.3996035985766205e-06, + "loss": 0.3546, + "step": 5626 + }, + { + "epoch": 3.9022191400832176, + "grad_norm": 0.3912934283652068, + "learning_rate": 1.3979243123345554e-06, + "loss": 0.3539, + "step": 5627 + }, + { + "epoch": 3.9029126213592233, + "grad_norm": 0.41547712306912676, + "learning_rate": 1.396245870392946e-06, + "loss": 0.3968, + "step": 5628 + }, + { + "epoch": 3.903606102635229, + "grad_norm": 0.41775988278850457, + "learning_rate": 1.3945682731452032e-06, + "loss": 0.3588, + "step": 5629 + }, + { + "epoch": 3.9042995839112344, + "grad_norm": 0.4095847301634584, + "learning_rate": 1.3928915209845451e-06, + "loss": 0.3949, + "step": 5630 + }, + { + "epoch": 3.9049930651872398, + "grad_norm": 0.3992895852152204, + "learning_rate": 1.3912156143039906e-06, + "loss": 0.4135, + "step": 5631 + }, + { + "epoch": 3.9056865464632455, + "grad_norm": 0.3959167841660771, + "learning_rate": 1.3895405534963607e-06, + "loss": 0.3448, + "step": 5632 + }, + { + "epoch": 3.9063800277392513, + "grad_norm": 0.40669169154979523, + "learning_rate": 1.3878663389542779e-06, + "loss": 0.3697, + "step": 5633 + }, + { + "epoch": 3.9070735090152566, + "grad_norm": 0.40659905161168647, + "learning_rate": 1.3861929710701633e-06, + "loss": 0.3569, + "step": 5634 + }, + { + "epoch": 3.907766990291262, + "grad_norm": 0.3980486492646797, + "learning_rate": 1.3845204502362442e-06, + "loss": 0.3224, + "step": 5635 + }, + { + "epoch": 3.9084604715672677, + "grad_norm": 0.38492900940580815, + "learning_rate": 1.3828487768445482e-06, + "loss": 0.3685, + "step": 5636 + }, + { + "epoch": 3.9091539528432735, + "grad_norm": 0.38464183219509007, + "learning_rate": 1.381177951286901e-06, + "loss": 0.3892, + "step": 5637 + }, + { + "epoch": 3.909847434119279, + "grad_norm": 0.38406230364296096, + "learning_rate": 1.3795079739549332e-06, + "loss": 0.3962, + "step": 5638 + }, + { + "epoch": 3.910540915395284, + "grad_norm": 0.3655593241734113, + "learning_rate": 1.377838845240077e-06, + "loss": 0.3683, + "step": 5639 + }, + { + "epoch": 3.91123439667129, + "grad_norm": 0.4028183338449733, + "learning_rate": 1.3761705655335595e-06, + "loss": 0.3823, + "step": 5640 + }, + { + "epoch": 3.9119278779472957, + "grad_norm": 0.37947637095830566, + "learning_rate": 1.37450313522642e-06, + "loss": 0.339, + "step": 5641 + }, + { + "epoch": 3.912621359223301, + "grad_norm": 0.41724287597837223, + "learning_rate": 1.3728365547094863e-06, + "loss": 0.3946, + "step": 5642 + }, + { + "epoch": 3.9133148404993063, + "grad_norm": 0.417172806676931, + "learning_rate": 1.3711708243733951e-06, + "loss": 0.3921, + "step": 5643 + }, + { + "epoch": 3.914008321775312, + "grad_norm": 0.35462495501465946, + "learning_rate": 1.369505944608583e-06, + "loss": 0.3398, + "step": 5644 + }, + { + "epoch": 3.914701803051318, + "grad_norm": 0.4499823715764414, + "learning_rate": 1.367841915805283e-06, + "loss": 0.4093, + "step": 5645 + }, + { + "epoch": 3.915395284327323, + "grad_norm": 0.4173737991050466, + "learning_rate": 1.3661787383535324e-06, + "loss": 0.411, + "step": 5646 + }, + { + "epoch": 3.9160887656033285, + "grad_norm": 0.43895724692895477, + "learning_rate": 1.3645164126431697e-06, + "loss": 0.3407, + "step": 5647 + }, + { + "epoch": 3.9167822468793343, + "grad_norm": 0.41000579747124954, + "learning_rate": 1.362854939063829e-06, + "loss": 0.3834, + "step": 5648 + }, + { + "epoch": 3.91747572815534, + "grad_norm": 0.48735253813451135, + "learning_rate": 1.3611943180049491e-06, + "loss": 0.4073, + "step": 5649 + }, + { + "epoch": 3.9181692094313454, + "grad_norm": 0.3858682781790578, + "learning_rate": 1.3595345498557677e-06, + "loss": 0.3388, + "step": 5650 + }, + { + "epoch": 3.9188626907073507, + "grad_norm": 0.47724094074796924, + "learning_rate": 1.3578756350053219e-06, + "loss": 0.3776, + "step": 5651 + }, + { + "epoch": 3.9195561719833565, + "grad_norm": 0.3862194568565286, + "learning_rate": 1.3562175738424515e-06, + "loss": 0.3993, + "step": 5652 + }, + { + "epoch": 3.9202496532593623, + "grad_norm": 0.41274317453823883, + "learning_rate": 1.3545603667557911e-06, + "loss": 0.3737, + "step": 5653 + }, + { + "epoch": 3.9209431345353676, + "grad_norm": 0.3887218217029318, + "learning_rate": 1.3529040141337801e-06, + "loss": 0.3912, + "step": 5654 + }, + { + "epoch": 3.921636615811373, + "grad_norm": 0.5143223803741318, + "learning_rate": 1.3512485163646537e-06, + "loss": 0.3523, + "step": 5655 + }, + { + "epoch": 3.9223300970873787, + "grad_norm": 0.43330889320819416, + "learning_rate": 1.3495938738364496e-06, + "loss": 0.3946, + "step": 5656 + }, + { + "epoch": 3.9230235783633844, + "grad_norm": 0.4027388474860819, + "learning_rate": 1.3479400869370052e-06, + "loss": 0.3851, + "step": 5657 + }, + { + "epoch": 3.9237170596393898, + "grad_norm": 0.42135575554887195, + "learning_rate": 1.346287156053952e-06, + "loss": 0.3982, + "step": 5658 + }, + { + "epoch": 3.924410540915395, + "grad_norm": 0.43683122299483046, + "learning_rate": 1.344635081574731e-06, + "loss": 0.4228, + "step": 5659 + }, + { + "epoch": 3.925104022191401, + "grad_norm": 0.45956794064458506, + "learning_rate": 1.3429838638865721e-06, + "loss": 0.3567, + "step": 5660 + }, + { + "epoch": 3.9257975034674066, + "grad_norm": 0.4253665212883945, + "learning_rate": 1.3413335033765102e-06, + "loss": 0.3552, + "step": 5661 + }, + { + "epoch": 3.926490984743412, + "grad_norm": 0.39429871151008744, + "learning_rate": 1.3396840004313789e-06, + "loss": 0.382, + "step": 5662 + }, + { + "epoch": 3.9271844660194173, + "grad_norm": 0.4259865437463772, + "learning_rate": 1.3380353554378074e-06, + "loss": 0.3401, + "step": 5663 + }, + { + "epoch": 3.927877947295423, + "grad_norm": 0.40489142307187015, + "learning_rate": 1.3363875687822276e-06, + "loss": 0.3548, + "step": 5664 + }, + { + "epoch": 3.928571428571429, + "grad_norm": 0.4493377903861885, + "learning_rate": 1.3347406408508695e-06, + "loss": 0.4119, + "step": 5665 + }, + { + "epoch": 3.929264909847434, + "grad_norm": 0.9507820720744213, + "learning_rate": 1.3330945720297594e-06, + "loss": 0.4008, + "step": 5666 + }, + { + "epoch": 3.9299583911234395, + "grad_norm": 0.5555360984295558, + "learning_rate": 1.3314493627047242e-06, + "loss": 0.4077, + "step": 5667 + }, + { + "epoch": 3.9306518723994452, + "grad_norm": 0.38394572692282086, + "learning_rate": 1.3298050132613893e-06, + "loss": 0.374, + "step": 5668 + }, + { + "epoch": 3.931345353675451, + "grad_norm": 0.4016585953908722, + "learning_rate": 1.3281615240851787e-06, + "loss": 0.335, + "step": 5669 + }, + { + "epoch": 3.9320388349514563, + "grad_norm": 0.3521420652161348, + "learning_rate": 1.3265188955613156e-06, + "loss": 0.3518, + "step": 5670 + }, + { + "epoch": 3.9327323162274617, + "grad_norm": 0.39605405300661, + "learning_rate": 1.3248771280748174e-06, + "loss": 0.3511, + "step": 5671 + }, + { + "epoch": 3.9334257975034674, + "grad_norm": 0.4060404594464979, + "learning_rate": 1.3232362220105038e-06, + "loss": 0.3475, + "step": 5672 + }, + { + "epoch": 3.934119278779473, + "grad_norm": 0.3945904332268029, + "learning_rate": 1.3215961777529928e-06, + "loss": 0.3639, + "step": 5673 + }, + { + "epoch": 3.9348127600554785, + "grad_norm": 0.6167005425479277, + "learning_rate": 1.3199569956866964e-06, + "loss": 0.3306, + "step": 5674 + }, + { + "epoch": 3.935506241331484, + "grad_norm": 0.4431107557237783, + "learning_rate": 1.3183186761958278e-06, + "loss": 0.3546, + "step": 5675 + }, + { + "epoch": 3.9361997226074896, + "grad_norm": 0.4196952370798847, + "learning_rate": 1.3166812196643974e-06, + "loss": 0.3617, + "step": 5676 + }, + { + "epoch": 3.9368932038834954, + "grad_norm": 0.5529102129775423, + "learning_rate": 1.3150446264762134e-06, + "loss": 0.3736, + "step": 5677 + }, + { + "epoch": 3.9375866851595007, + "grad_norm": 0.36804803724176843, + "learning_rate": 1.3134088970148828e-06, + "loss": 0.3351, + "step": 5678 + }, + { + "epoch": 3.938280166435506, + "grad_norm": 0.42122798419618446, + "learning_rate": 1.3117740316638055e-06, + "loss": 0.3655, + "step": 5679 + }, + { + "epoch": 3.938973647711512, + "grad_norm": 0.3950545471418325, + "learning_rate": 1.310140030806184e-06, + "loss": 0.3039, + "step": 5680 + }, + { + "epoch": 3.9396671289875176, + "grad_norm": 0.43208510261210614, + "learning_rate": 1.3085068948250174e-06, + "loss": 0.4075, + "step": 5681 + }, + { + "epoch": 3.940360610263523, + "grad_norm": 0.4096528815786458, + "learning_rate": 1.3068746241030983e-06, + "loss": 0.3788, + "step": 5682 + }, + { + "epoch": 3.9410540915395282, + "grad_norm": 0.39946228193954975, + "learning_rate": 1.3052432190230202e-06, + "loss": 0.4224, + "step": 5683 + }, + { + "epoch": 3.941747572815534, + "grad_norm": 0.3741483253809102, + "learning_rate": 1.3036126799671734e-06, + "loss": 0.3911, + "step": 5684 + }, + { + "epoch": 3.9424410540915398, + "grad_norm": 0.380714868716621, + "learning_rate": 1.301983007317743e-06, + "loss": 0.3652, + "step": 5685 + }, + { + "epoch": 3.943134535367545, + "grad_norm": 0.386574857583611, + "learning_rate": 1.3003542014567156e-06, + "loss": 0.4137, + "step": 5686 + }, + { + "epoch": 3.9438280166435504, + "grad_norm": 0.3730005133971035, + "learning_rate": 1.2987262627658676e-06, + "loss": 0.3908, + "step": 5687 + }, + { + "epoch": 3.944521497919556, + "grad_norm": 0.38425886028701634, + "learning_rate": 1.2970991916267779e-06, + "loss": 0.3417, + "step": 5688 + }, + { + "epoch": 3.945214979195562, + "grad_norm": 0.4158896032084831, + "learning_rate": 1.2954729884208212e-06, + "loss": 0.431, + "step": 5689 + }, + { + "epoch": 3.9459084604715673, + "grad_norm": 0.5780251321763046, + "learning_rate": 1.293847653529165e-06, + "loss": 0.4322, + "step": 5690 + }, + { + "epoch": 3.9466019417475726, + "grad_norm": 0.4082134245329464, + "learning_rate": 1.2922231873327779e-06, + "loss": 0.3749, + "step": 5691 + }, + { + "epoch": 3.9472954230235784, + "grad_norm": 0.39724644301906575, + "learning_rate": 1.2905995902124242e-06, + "loss": 0.3916, + "step": 5692 + }, + { + "epoch": 3.947988904299584, + "grad_norm": 0.46077900169185376, + "learning_rate": 1.2889768625486588e-06, + "loss": 0.4387, + "step": 5693 + }, + { + "epoch": 3.9486823855755895, + "grad_norm": 0.3837876641922954, + "learning_rate": 1.287355004721843e-06, + "loss": 0.3935, + "step": 5694 + }, + { + "epoch": 3.949375866851595, + "grad_norm": 0.42749401495919626, + "learning_rate": 1.2857340171121246e-06, + "loss": 0.4038, + "step": 5695 + }, + { + "epoch": 3.9500693481276006, + "grad_norm": 0.40319675594681176, + "learning_rate": 1.2841139000994524e-06, + "loss": 0.4052, + "step": 5696 + }, + { + "epoch": 3.9507628294036063, + "grad_norm": 0.39906374254605154, + "learning_rate": 1.2824946540635725e-06, + "loss": 0.3877, + "step": 5697 + }, + { + "epoch": 3.9514563106796117, + "grad_norm": 0.5190401484270997, + "learning_rate": 1.28087627938402e-06, + "loss": 0.3673, + "step": 5698 + }, + { + "epoch": 3.952149791955617, + "grad_norm": 0.3896400631056696, + "learning_rate": 1.2792587764401343e-06, + "loss": 0.3508, + "step": 5699 + }, + { + "epoch": 3.9528432732316228, + "grad_norm": 0.44636566145299666, + "learning_rate": 1.2776421456110427e-06, + "loss": 0.4022, + "step": 5700 + }, + { + "epoch": 3.9535367545076285, + "grad_norm": 0.4054930476058675, + "learning_rate": 1.276026387275674e-06, + "loss": 0.3983, + "step": 5701 + }, + { + "epoch": 3.954230235783634, + "grad_norm": 0.41704845024078757, + "learning_rate": 1.2744115018127494e-06, + "loss": 0.406, + "step": 5702 + }, + { + "epoch": 3.954923717059639, + "grad_norm": 0.3559236138431055, + "learning_rate": 1.2727974896007871e-06, + "loss": 0.3662, + "step": 5703 + }, + { + "epoch": 3.955617198335645, + "grad_norm": 0.3939122948268484, + "learning_rate": 1.271184351018101e-06, + "loss": 0.3952, + "step": 5704 + }, + { + "epoch": 3.9563106796116507, + "grad_norm": 0.5070237755574002, + "learning_rate": 1.2695720864427963e-06, + "loss": 0.3815, + "step": 5705 + }, + { + "epoch": 3.957004160887656, + "grad_norm": 0.37621573730475083, + "learning_rate": 1.2679606962527774e-06, + "loss": 0.3958, + "step": 5706 + }, + { + "epoch": 3.9576976421636614, + "grad_norm": 0.3873877405344169, + "learning_rate": 1.2663501808257444e-06, + "loss": 0.3198, + "step": 5707 + }, + { + "epoch": 3.958391123439667, + "grad_norm": 0.40384457113927813, + "learning_rate": 1.2647405405391867e-06, + "loss": 0.3937, + "step": 5708 + }, + { + "epoch": 3.959084604715673, + "grad_norm": 0.5014507156443899, + "learning_rate": 1.2631317757703942e-06, + "loss": 0.4072, + "step": 5709 + }, + { + "epoch": 3.9597780859916782, + "grad_norm": 0.37142258371238035, + "learning_rate": 1.261523886896452e-06, + "loss": 0.3422, + "step": 5710 + }, + { + "epoch": 3.9604715672676836, + "grad_norm": 0.5402782594966292, + "learning_rate": 1.259916874294232e-06, + "loss": 0.4034, + "step": 5711 + }, + { + "epoch": 3.9611650485436893, + "grad_norm": 0.4056859169048281, + "learning_rate": 1.2583107383404125e-06, + "loss": 0.3558, + "step": 5712 + }, + { + "epoch": 3.961858529819695, + "grad_norm": 0.38955498094338076, + "learning_rate": 1.2567054794114558e-06, + "loss": 0.3587, + "step": 5713 + }, + { + "epoch": 3.9625520110957004, + "grad_norm": 0.38933492535119524, + "learning_rate": 1.2551010978836247e-06, + "loss": 0.387, + "step": 5714 + }, + { + "epoch": 3.9632454923717058, + "grad_norm": 0.389806625283031, + "learning_rate": 1.2534975941329758e-06, + "loss": 0.3761, + "step": 5715 + }, + { + "epoch": 3.9639389736477115, + "grad_norm": 3.454146150584736, + "learning_rate": 1.251894968535356e-06, + "loss": 0.3615, + "step": 5716 + }, + { + "epoch": 3.9646324549237173, + "grad_norm": 0.3867506930870249, + "learning_rate": 1.250293221466411e-06, + "loss": 0.3645, + "step": 5717 + }, + { + "epoch": 3.9653259361997226, + "grad_norm": 0.41817483163562436, + "learning_rate": 1.2486923533015788e-06, + "loss": 0.3917, + "step": 5718 + }, + { + "epoch": 3.966019417475728, + "grad_norm": 0.4368784443128529, + "learning_rate": 1.2470923644160898e-06, + "loss": 0.3642, + "step": 5719 + }, + { + "epoch": 3.9667128987517337, + "grad_norm": 0.4413408560547274, + "learning_rate": 1.2454932551849708e-06, + "loss": 0.3581, + "step": 5720 + }, + { + "epoch": 3.9674063800277395, + "grad_norm": 0.4303083673464683, + "learning_rate": 1.2438950259830412e-06, + "loss": 0.3489, + "step": 5721 + }, + { + "epoch": 3.968099861303745, + "grad_norm": 0.41365343058986725, + "learning_rate": 1.2422976771849144e-06, + "loss": 0.3979, + "step": 5722 + }, + { + "epoch": 3.96879334257975, + "grad_norm": 0.4046215559080561, + "learning_rate": 1.2407012091649996e-06, + "loss": 0.3596, + "step": 5723 + }, + { + "epoch": 3.969486823855756, + "grad_norm": 0.42790200349990853, + "learning_rate": 1.2391056222974928e-06, + "loss": 0.3709, + "step": 5724 + }, + { + "epoch": 3.9701803051317617, + "grad_norm": 0.40952428765560905, + "learning_rate": 1.2375109169563915e-06, + "loss": 0.3867, + "step": 5725 + }, + { + "epoch": 3.970873786407767, + "grad_norm": 0.44226633604353166, + "learning_rate": 1.235917093515483e-06, + "loss": 0.3773, + "step": 5726 + }, + { + "epoch": 3.9715672676837723, + "grad_norm": 0.37553802642405126, + "learning_rate": 1.2343241523483452e-06, + "loss": 0.3348, + "step": 5727 + }, + { + "epoch": 3.972260748959778, + "grad_norm": 0.39257206808193185, + "learning_rate": 1.2327320938283543e-06, + "loss": 0.3726, + "step": 5728 + }, + { + "epoch": 3.972954230235784, + "grad_norm": 0.393711723909782, + "learning_rate": 1.2311409183286765e-06, + "loss": 0.3728, + "step": 5729 + }, + { + "epoch": 3.973647711511789, + "grad_norm": 0.4296746345286358, + "learning_rate": 1.2295506262222723e-06, + "loss": 0.4195, + "step": 5730 + }, + { + "epoch": 3.9743411927877945, + "grad_norm": 0.41507157274395867, + "learning_rate": 1.2279612178818955e-06, + "loss": 0.3618, + "step": 5731 + }, + { + "epoch": 3.9750346740638003, + "grad_norm": 0.38761962885321694, + "learning_rate": 1.2263726936800895e-06, + "loss": 0.371, + "step": 5732 + }, + { + "epoch": 3.975728155339806, + "grad_norm": 0.393850569894318, + "learning_rate": 1.2247850539891947e-06, + "loss": 0.3788, + "step": 5733 + }, + { + "epoch": 3.9764216366158114, + "grad_norm": 0.6760980892034429, + "learning_rate": 1.2231982991813428e-06, + "loss": 0.3643, + "step": 5734 + }, + { + "epoch": 3.9771151178918167, + "grad_norm": 0.4030177735429908, + "learning_rate": 1.2216124296284554e-06, + "loss": 0.3996, + "step": 5735 + }, + { + "epoch": 3.9778085991678225, + "grad_norm": 0.5547147831236997, + "learning_rate": 1.2200274457022503e-06, + "loss": 0.3668, + "step": 5736 + }, + { + "epoch": 3.9785020804438282, + "grad_norm": 0.39146584408378815, + "learning_rate": 1.2184433477742375e-06, + "loss": 0.3693, + "step": 5737 + }, + { + "epoch": 3.9791955617198336, + "grad_norm": 0.3911153887592586, + "learning_rate": 1.2168601362157134e-06, + "loss": 0.38, + "step": 5738 + }, + { + "epoch": 3.979889042995839, + "grad_norm": 0.41342194857756964, + "learning_rate": 1.2152778113977776e-06, + "loss": 0.4073, + "step": 5739 + }, + { + "epoch": 3.9805825242718447, + "grad_norm": 0.38444966834365457, + "learning_rate": 1.2136963736913117e-06, + "loss": 0.3887, + "step": 5740 + }, + { + "epoch": 3.9812760055478504, + "grad_norm": 0.42775677441910975, + "learning_rate": 1.2121158234669933e-06, + "loss": 0.3941, + "step": 5741 + }, + { + "epoch": 3.9819694868238558, + "grad_norm": 0.40064894573830545, + "learning_rate": 1.210536161095295e-06, + "loss": 0.4037, + "step": 5742 + }, + { + "epoch": 3.982662968099861, + "grad_norm": 0.4604868136975249, + "learning_rate": 1.2089573869464738e-06, + "loss": 0.4009, + "step": 5743 + }, + { + "epoch": 3.983356449375867, + "grad_norm": 0.38853712077021296, + "learning_rate": 1.2073795013905865e-06, + "loss": 0.3396, + "step": 5744 + }, + { + "epoch": 3.9840499306518726, + "grad_norm": 0.4119203939330921, + "learning_rate": 1.2058025047974753e-06, + "loss": 0.3719, + "step": 5745 + }, + { + "epoch": 3.984743411927878, + "grad_norm": 0.3683332161664147, + "learning_rate": 1.2042263975367785e-06, + "loss": 0.3709, + "step": 5746 + }, + { + "epoch": 3.9854368932038833, + "grad_norm": 0.43071177651480674, + "learning_rate": 1.2026511799779234e-06, + "loss": 0.4027, + "step": 5747 + }, + { + "epoch": 3.986130374479889, + "grad_norm": 0.41169056270401494, + "learning_rate": 1.20107685249013e-06, + "loss": 0.3624, + "step": 5748 + }, + { + "epoch": 3.986823855755895, + "grad_norm": 0.4329112779834001, + "learning_rate": 1.1995034154424111e-06, + "loss": 0.3882, + "step": 5749 + }, + { + "epoch": 3.9875173370319, + "grad_norm": 0.3797986135112781, + "learning_rate": 1.1979308692035658e-06, + "loss": 0.4042, + "step": 5750 + }, + { + "epoch": 3.9882108183079055, + "grad_norm": 0.3960584676111404, + "learning_rate": 1.1963592141421882e-06, + "loss": 0.3884, + "step": 5751 + }, + { + "epoch": 3.9889042995839112, + "grad_norm": 0.4180044032794779, + "learning_rate": 1.1947884506266655e-06, + "loss": 0.3754, + "step": 5752 + }, + { + "epoch": 3.989597780859917, + "grad_norm": 0.3824105358910305, + "learning_rate": 1.1932185790251698e-06, + "loss": 0.3716, + "step": 5753 + }, + { + "epoch": 3.9902912621359223, + "grad_norm": 0.5620414926778071, + "learning_rate": 1.1916495997056693e-06, + "loss": 0.4353, + "step": 5754 + }, + { + "epoch": 3.9909847434119277, + "grad_norm": 0.41368148127812576, + "learning_rate": 1.1900815130359223e-06, + "loss": 0.4048, + "step": 5755 + }, + { + "epoch": 3.9916782246879334, + "grad_norm": 0.4418519608109611, + "learning_rate": 1.1885143193834735e-06, + "loss": 0.3507, + "step": 5756 + }, + { + "epoch": 3.992371705963939, + "grad_norm": 0.4403404777799451, + "learning_rate": 1.1869480191156668e-06, + "loss": 0.4034, + "step": 5757 + }, + { + "epoch": 3.9930651872399445, + "grad_norm": 0.4105981298795642, + "learning_rate": 1.1853826125996277e-06, + "loss": 0.3473, + "step": 5758 + }, + { + "epoch": 3.99375866851595, + "grad_norm": 0.4595860775755933, + "learning_rate": 1.183818100202277e-06, + "loss": 0.3586, + "step": 5759 + }, + { + "epoch": 3.9944521497919556, + "grad_norm": 0.38993742758248, + "learning_rate": 1.1822544822903275e-06, + "loss": 0.358, + "step": 5760 + }, + { + "epoch": 3.9951456310679614, + "grad_norm": 0.37988196429486354, + "learning_rate": 1.1806917592302763e-06, + "loss": 0.3809, + "step": 5761 + }, + { + "epoch": 3.9958391123439667, + "grad_norm": 0.40003527774922076, + "learning_rate": 1.1791299313884158e-06, + "loss": 0.4013, + "step": 5762 + }, + { + "epoch": 3.996532593619972, + "grad_norm": 0.362320198547181, + "learning_rate": 1.1775689991308292e-06, + "loss": 0.3186, + "step": 5763 + }, + { + "epoch": 3.997226074895978, + "grad_norm": 0.4375356344905873, + "learning_rate": 1.176008962823384e-06, + "loss": 0.3893, + "step": 5764 + }, + { + "epoch": 3.9979195561719836, + "grad_norm": 0.984875063636234, + "learning_rate": 1.1744498228317436e-06, + "loss": 0.3639, + "step": 5765 + }, + { + "epoch": 3.998613037447989, + "grad_norm": 0.39050098862356036, + "learning_rate": 1.1728915795213586e-06, + "loss": 0.376, + "step": 5766 + }, + { + "epoch": 3.9993065187239942, + "grad_norm": 0.352767593457289, + "learning_rate": 1.1713342332574702e-06, + "loss": 0.3783, + "step": 5767 + }, + { + "epoch": 4.0, + "grad_norm": 0.41523151464963964, + "learning_rate": 1.1697777844051105e-06, + "loss": 0.3621, + "step": 5768 + }, + { + "epoch": 4.000693481276006, + "grad_norm": 0.44340135856638263, + "learning_rate": 1.168222233329097e-06, + "loss": 0.3281, + "step": 5769 + }, + { + "epoch": 4.0013869625520115, + "grad_norm": 0.37844197093984355, + "learning_rate": 1.166667580394041e-06, + "loss": 0.35, + "step": 5770 + }, + { + "epoch": 4.002080443828016, + "grad_norm": 0.3888541375827065, + "learning_rate": 1.165113825964343e-06, + "loss": 0.2985, + "step": 5771 + }, + { + "epoch": 4.002773925104022, + "grad_norm": 0.4093681633758278, + "learning_rate": 1.1635609704041896e-06, + "loss": 0.3409, + "step": 5772 + }, + { + "epoch": 4.003467406380028, + "grad_norm": 0.38915627815727133, + "learning_rate": 1.1620090140775598e-06, + "loss": 0.3878, + "step": 5773 + }, + { + "epoch": 4.004160887656034, + "grad_norm": 0.36655796389465645, + "learning_rate": 1.1604579573482205e-06, + "loss": 0.3688, + "step": 5774 + }, + { + "epoch": 4.004854368932039, + "grad_norm": 0.38891063951151855, + "learning_rate": 1.1589078005797294e-06, + "loss": 0.3708, + "step": 5775 + }, + { + "epoch": 4.005547850208044, + "grad_norm": 0.37569681015935014, + "learning_rate": 1.1573585441354324e-06, + "loss": 0.3471, + "step": 5776 + }, + { + "epoch": 4.00624133148405, + "grad_norm": 0.3830565705662529, + "learning_rate": 1.1558101883784616e-06, + "loss": 0.3802, + "step": 5777 + }, + { + "epoch": 4.006934812760056, + "grad_norm": 0.40636187669279616, + "learning_rate": 1.1542627336717422e-06, + "loss": 0.3797, + "step": 5778 + }, + { + "epoch": 4.007628294036061, + "grad_norm": 0.42411507237677165, + "learning_rate": 1.1527161803779868e-06, + "loss": 0.3936, + "step": 5779 + }, + { + "epoch": 4.008321775312067, + "grad_norm": 0.38796922795575917, + "learning_rate": 1.151170528859694e-06, + "loss": 0.307, + "step": 5780 + }, + { + "epoch": 4.009015256588072, + "grad_norm": 0.38224374417128243, + "learning_rate": 1.149625779479156e-06, + "loss": 0.3419, + "step": 5781 + }, + { + "epoch": 4.009708737864078, + "grad_norm": 0.37525329652006423, + "learning_rate": 1.1480819325984489e-06, + "loss": 0.344, + "step": 5782 + }, + { + "epoch": 4.010402219140083, + "grad_norm": 0.4250747270169149, + "learning_rate": 1.1465389885794376e-06, + "loss": 0.402, + "step": 5783 + }, + { + "epoch": 4.011095700416089, + "grad_norm": 0.46700080359458773, + "learning_rate": 1.1449969477837825e-06, + "loss": 0.3944, + "step": 5784 + }, + { + "epoch": 4.0117891816920945, + "grad_norm": 0.36829362749736344, + "learning_rate": 1.143455810572922e-06, + "loss": 0.3254, + "step": 5785 + }, + { + "epoch": 4.0124826629681, + "grad_norm": 0.43814893513893954, + "learning_rate": 1.1419155773080893e-06, + "loss": 0.3457, + "step": 5786 + }, + { + "epoch": 4.013176144244105, + "grad_norm": 0.38702579584264457, + "learning_rate": 1.140376248350305e-06, + "loss": 0.3641, + "step": 5787 + }, + { + "epoch": 4.013869625520111, + "grad_norm": 0.38462620213273563, + "learning_rate": 1.1388378240603742e-06, + "loss": 0.3292, + "step": 5788 + }, + { + "epoch": 4.014563106796117, + "grad_norm": 0.41151676921839425, + "learning_rate": 1.1373003047988952e-06, + "loss": 0.3326, + "step": 5789 + }, + { + "epoch": 4.0152565880721225, + "grad_norm": 0.3810523645944822, + "learning_rate": 1.1357636909262477e-06, + "loss": 0.3398, + "step": 5790 + }, + { + "epoch": 4.015950069348127, + "grad_norm": 0.4253137185116376, + "learning_rate": 1.1342279828026054e-06, + "loss": 0.3407, + "step": 5791 + }, + { + "epoch": 4.016643550624133, + "grad_norm": 0.5676880788011671, + "learning_rate": 1.1326931807879266e-06, + "loss": 0.3715, + "step": 5792 + }, + { + "epoch": 4.017337031900139, + "grad_norm": 0.42492385061892185, + "learning_rate": 1.1311592852419574e-06, + "loss": 0.3416, + "step": 5793 + }, + { + "epoch": 4.018030513176145, + "grad_norm": 0.4274375983234086, + "learning_rate": 1.1296262965242345e-06, + "loss": 0.3459, + "step": 5794 + }, + { + "epoch": 4.01872399445215, + "grad_norm": 0.425841892769472, + "learning_rate": 1.128094214994075e-06, + "loss": 0.3563, + "step": 5795 + }, + { + "epoch": 4.019417475728155, + "grad_norm": 0.4009974210815654, + "learning_rate": 1.1265630410105905e-06, + "loss": 0.3872, + "step": 5796 + }, + { + "epoch": 4.020110957004161, + "grad_norm": 0.37085478252614934, + "learning_rate": 1.1250327749326772e-06, + "loss": 0.3191, + "step": 5797 + }, + { + "epoch": 4.020804438280167, + "grad_norm": 0.41412108196268654, + "learning_rate": 1.1235034171190167e-06, + "loss": 0.3312, + "step": 5798 + }, + { + "epoch": 4.021497919556172, + "grad_norm": 0.3904627089081514, + "learning_rate": 1.12197496792808e-06, + "loss": 0.3492, + "step": 5799 + }, + { + "epoch": 4.0221914008321775, + "grad_norm": 0.7116073319918248, + "learning_rate": 1.1204474277181265e-06, + "loss": 0.3642, + "step": 5800 + }, + { + "epoch": 4.022884882108183, + "grad_norm": 0.38217680849421404, + "learning_rate": 1.118920796847196e-06, + "loss": 0.3412, + "step": 5801 + }, + { + "epoch": 4.023578363384189, + "grad_norm": 0.39098852916806814, + "learning_rate": 1.1173950756731256e-06, + "loss": 0.3271, + "step": 5802 + }, + { + "epoch": 4.024271844660194, + "grad_norm": 0.4464142485224417, + "learning_rate": 1.1158702645535285e-06, + "loss": 0.3637, + "step": 5803 + }, + { + "epoch": 4.0249653259362, + "grad_norm": 0.4173176453800928, + "learning_rate": 1.1143463638458113e-06, + "loss": 0.3696, + "step": 5804 + }, + { + "epoch": 4.0256588072122055, + "grad_norm": 0.4913713837177101, + "learning_rate": 1.112823373907167e-06, + "loss": 0.3638, + "step": 5805 + }, + { + "epoch": 4.026352288488211, + "grad_norm": 0.42034173246720674, + "learning_rate": 1.1113012950945695e-06, + "loss": 0.3667, + "step": 5806 + }, + { + "epoch": 4.027045769764216, + "grad_norm": 0.35948903960334144, + "learning_rate": 1.1097801277647858e-06, + "loss": 0.308, + "step": 5807 + }, + { + "epoch": 4.027739251040222, + "grad_norm": 0.40789981807820574, + "learning_rate": 1.108259872274367e-06, + "loss": 0.3499, + "step": 5808 + }, + { + "epoch": 4.028432732316228, + "grad_norm": 0.4066987525319391, + "learning_rate": 1.1067405289796474e-06, + "loss": 0.3539, + "step": 5809 + }, + { + "epoch": 4.029126213592233, + "grad_norm": 0.39672304986182455, + "learning_rate": 1.1052220982367518e-06, + "loss": 0.3452, + "step": 5810 + }, + { + "epoch": 4.029819694868238, + "grad_norm": 0.454528790508705, + "learning_rate": 1.103704580401589e-06, + "loss": 0.3611, + "step": 5811 + }, + { + "epoch": 4.030513176144244, + "grad_norm": 0.39579841288328005, + "learning_rate": 1.1021879758298538e-06, + "loss": 0.3343, + "step": 5812 + }, + { + "epoch": 4.03120665742025, + "grad_norm": 0.44487426916818584, + "learning_rate": 1.1006722848770295e-06, + "loss": 0.361, + "step": 5813 + }, + { + "epoch": 4.031900138696256, + "grad_norm": 0.43506610552830605, + "learning_rate": 1.0991575078983802e-06, + "loss": 0.3671, + "step": 5814 + }, + { + "epoch": 4.0325936199722605, + "grad_norm": 0.38249264145269624, + "learning_rate": 1.0976436452489592e-06, + "loss": 0.3084, + "step": 5815 + }, + { + "epoch": 4.033287101248266, + "grad_norm": 0.41508076013418466, + "learning_rate": 1.0961306972836079e-06, + "loss": 0.336, + "step": 5816 + }, + { + "epoch": 4.033980582524272, + "grad_norm": 0.38701935717543295, + "learning_rate": 1.0946186643569456e-06, + "loss": 0.3503, + "step": 5817 + }, + { + "epoch": 4.034674063800278, + "grad_norm": 0.4519688878270693, + "learning_rate": 1.0931075468233849e-06, + "loss": 0.3409, + "step": 5818 + }, + { + "epoch": 4.035367545076283, + "grad_norm": 0.41867670357078235, + "learning_rate": 1.0915973450371198e-06, + "loss": 0.3737, + "step": 5819 + }, + { + "epoch": 4.0360610263522885, + "grad_norm": 0.5001610437351757, + "learning_rate": 1.0900880593521312e-06, + "loss": 0.3505, + "step": 5820 + }, + { + "epoch": 4.036754507628294, + "grad_norm": 0.40845155003360933, + "learning_rate": 1.0885796901221863e-06, + "loss": 0.3388, + "step": 5821 + }, + { + "epoch": 4.0374479889043, + "grad_norm": 0.40410482736945486, + "learning_rate": 1.0870722377008324e-06, + "loss": 0.3463, + "step": 5822 + }, + { + "epoch": 4.038141470180305, + "grad_norm": 0.42609479627173835, + "learning_rate": 1.0855657024414074e-06, + "loss": 0.3836, + "step": 5823 + }, + { + "epoch": 4.038834951456311, + "grad_norm": 0.4221382488108252, + "learning_rate": 1.0840600846970333e-06, + "loss": 0.3815, + "step": 5824 + }, + { + "epoch": 4.039528432732316, + "grad_norm": 0.4192382643715479, + "learning_rate": 1.0825553848206133e-06, + "loss": 0.3773, + "step": 5825 + }, + { + "epoch": 4.040221914008322, + "grad_norm": 0.3643725487085566, + "learning_rate": 1.0810516031648415e-06, + "loss": 0.3345, + "step": 5826 + }, + { + "epoch": 4.040915395284327, + "grad_norm": 0.4376391868602908, + "learning_rate": 1.0795487400821897e-06, + "loss": 0.3359, + "step": 5827 + }, + { + "epoch": 4.041608876560333, + "grad_norm": 0.4169450078097028, + "learning_rate": 1.078046795924919e-06, + "loss": 0.3166, + "step": 5828 + }, + { + "epoch": 4.042302357836339, + "grad_norm": 0.42587114592007663, + "learning_rate": 1.0765457710450777e-06, + "loss": 0.3008, + "step": 5829 + }, + { + "epoch": 4.042995839112344, + "grad_norm": 0.38854199686484153, + "learning_rate": 1.0750456657944913e-06, + "loss": 0.3545, + "step": 5830 + }, + { + "epoch": 4.043689320388349, + "grad_norm": 0.3964140826905079, + "learning_rate": 1.0735464805247763e-06, + "loss": 0.3201, + "step": 5831 + }, + { + "epoch": 4.044382801664355, + "grad_norm": 0.38700042933700374, + "learning_rate": 1.0720482155873286e-06, + "loss": 0.3495, + "step": 5832 + }, + { + "epoch": 4.045076282940361, + "grad_norm": 0.3654272643891456, + "learning_rate": 1.0705508713333313e-06, + "loss": 0.3343, + "step": 5833 + }, + { + "epoch": 4.045769764216367, + "grad_norm": 0.42851269987260987, + "learning_rate": 1.0690544481137527e-06, + "loss": 0.3717, + "step": 5834 + }, + { + "epoch": 4.0464632454923715, + "grad_norm": 0.4321138900128039, + "learning_rate": 1.0675589462793406e-06, + "loss": 0.3158, + "step": 5835 + }, + { + "epoch": 4.047156726768377, + "grad_norm": 0.3930406818219885, + "learning_rate": 1.0660643661806319e-06, + "loss": 0.3426, + "step": 5836 + }, + { + "epoch": 4.047850208044383, + "grad_norm": 0.44830600167565265, + "learning_rate": 1.0645707081679446e-06, + "loss": 0.339, + "step": 5837 + }, + { + "epoch": 4.048543689320389, + "grad_norm": 0.4348642546430056, + "learning_rate": 1.063077972591382e-06, + "loss": 0.3435, + "step": 5838 + }, + { + "epoch": 4.049237170596394, + "grad_norm": 0.4027083954062146, + "learning_rate": 1.061586159800831e-06, + "loss": 0.3635, + "step": 5839 + }, + { + "epoch": 4.049930651872399, + "grad_norm": 0.41830438917877455, + "learning_rate": 1.0600952701459595e-06, + "loss": 0.4183, + "step": 5840 + }, + { + "epoch": 4.050624133148405, + "grad_norm": 0.39023486938742546, + "learning_rate": 1.0586053039762228e-06, + "loss": 0.3425, + "step": 5841 + }, + { + "epoch": 4.051317614424411, + "grad_norm": 0.4523300636524953, + "learning_rate": 1.0571162616408586e-06, + "loss": 0.3569, + "step": 5842 + }, + { + "epoch": 4.052011095700416, + "grad_norm": 0.45787467512678826, + "learning_rate": 1.0556281434888865e-06, + "loss": 0.3735, + "step": 5843 + }, + { + "epoch": 4.052704576976422, + "grad_norm": 0.3753388491079622, + "learning_rate": 1.0541409498691109e-06, + "loss": 0.3019, + "step": 5844 + }, + { + "epoch": 4.053398058252427, + "grad_norm": 0.4100187041060469, + "learning_rate": 1.0526546811301203e-06, + "loss": 0.3652, + "step": 5845 + }, + { + "epoch": 4.054091539528433, + "grad_norm": 0.3889532168033599, + "learning_rate": 1.051169337620282e-06, + "loss": 0.3368, + "step": 5846 + }, + { + "epoch": 4.054785020804438, + "grad_norm": 0.4226088476714647, + "learning_rate": 1.0496849196877545e-06, + "loss": 0.3572, + "step": 5847 + }, + { + "epoch": 4.055478502080444, + "grad_norm": 0.42250312136606627, + "learning_rate": 1.0482014276804713e-06, + "loss": 0.3532, + "step": 5848 + }, + { + "epoch": 4.05617198335645, + "grad_norm": 0.3852870358893622, + "learning_rate": 1.0467188619461532e-06, + "loss": 0.3477, + "step": 5849 + }, + { + "epoch": 4.056865464632455, + "grad_norm": 0.39866227735554816, + "learning_rate": 1.0452372228323044e-06, + "loss": 0.3189, + "step": 5850 + }, + { + "epoch": 4.05755894590846, + "grad_norm": 0.3843644290783127, + "learning_rate": 1.0437565106862075e-06, + "loss": 0.3868, + "step": 5851 + }, + { + "epoch": 4.058252427184466, + "grad_norm": 0.3615829358056039, + "learning_rate": 1.0422767258549317e-06, + "loss": 0.323, + "step": 5852 + }, + { + "epoch": 4.058945908460472, + "grad_norm": 0.4439796019335204, + "learning_rate": 1.0407978686853299e-06, + "loss": 0.3447, + "step": 5853 + }, + { + "epoch": 4.0596393897364775, + "grad_norm": 0.3768287003413617, + "learning_rate": 1.0393199395240317e-06, + "loss": 0.3475, + "step": 5854 + }, + { + "epoch": 4.060332871012482, + "grad_norm": 0.503640439688754, + "learning_rate": 1.037842938717456e-06, + "loss": 0.3474, + "step": 5855 + }, + { + "epoch": 4.061026352288488, + "grad_norm": 0.49567142797013075, + "learning_rate": 1.0363668666117992e-06, + "loss": 0.3005, + "step": 5856 + }, + { + "epoch": 4.061719833564494, + "grad_norm": 0.37315626340508873, + "learning_rate": 1.0348917235530437e-06, + "loss": 0.3179, + "step": 5857 + }, + { + "epoch": 4.0624133148405, + "grad_norm": 0.4498268830274607, + "learning_rate": 1.0334175098869526e-06, + "loss": 0.3839, + "step": 5858 + }, + { + "epoch": 4.063106796116505, + "grad_norm": 0.40218182598713376, + "learning_rate": 1.0319442259590683e-06, + "loss": 0.3357, + "step": 5859 + }, + { + "epoch": 4.06380027739251, + "grad_norm": 0.4095507314444169, + "learning_rate": 1.03047187211472e-06, + "loss": 0.3736, + "step": 5860 + }, + { + "epoch": 4.064493758668516, + "grad_norm": 0.37354967270115313, + "learning_rate": 1.0290004486990169e-06, + "loss": 0.3463, + "step": 5861 + }, + { + "epoch": 4.065187239944522, + "grad_norm": 0.38827261263086965, + "learning_rate": 1.0275299560568486e-06, + "loss": 0.3524, + "step": 5862 + }, + { + "epoch": 4.065880721220527, + "grad_norm": 0.39194774468101096, + "learning_rate": 1.0260603945328878e-06, + "loss": 0.3647, + "step": 5863 + }, + { + "epoch": 4.066574202496533, + "grad_norm": 0.37407480909251034, + "learning_rate": 1.0245917644715908e-06, + "loss": 0.3454, + "step": 5864 + }, + { + "epoch": 4.067267683772538, + "grad_norm": 0.3945946671705303, + "learning_rate": 1.023124066217192e-06, + "loss": 0.3709, + "step": 5865 + }, + { + "epoch": 4.067961165048544, + "grad_norm": 0.40054137089556113, + "learning_rate": 1.0216573001137125e-06, + "loss": 0.3764, + "step": 5866 + }, + { + "epoch": 4.068654646324549, + "grad_norm": 0.3874714076684071, + "learning_rate": 1.0201914665049472e-06, + "loss": 0.3226, + "step": 5867 + }, + { + "epoch": 4.069348127600555, + "grad_norm": 0.4331782843327501, + "learning_rate": 1.0187265657344796e-06, + "loss": 0.4019, + "step": 5868 + }, + { + "epoch": 4.0700416088765605, + "grad_norm": 0.4302859450348201, + "learning_rate": 1.0172625981456723e-06, + "loss": 0.3213, + "step": 5869 + }, + { + "epoch": 4.070735090152566, + "grad_norm": 0.38244038205125636, + "learning_rate": 1.0157995640816665e-06, + "loss": 0.3849, + "step": 5870 + }, + { + "epoch": 4.071428571428571, + "grad_norm": 0.38936163775891824, + "learning_rate": 1.0143374638853892e-06, + "loss": 0.3651, + "step": 5871 + }, + { + "epoch": 4.072122052704577, + "grad_norm": 0.40641703382552175, + "learning_rate": 1.0128762978995422e-06, + "loss": 0.3693, + "step": 5872 + }, + { + "epoch": 4.072815533980583, + "grad_norm": 0.46322072472640285, + "learning_rate": 1.0114160664666156e-06, + "loss": 0.348, + "step": 5873 + }, + { + "epoch": 4.0735090152565885, + "grad_norm": 0.41789040908115627, + "learning_rate": 1.0099567699288786e-06, + "loss": 0.3719, + "step": 5874 + }, + { + "epoch": 4.074202496532593, + "grad_norm": 0.4272050712727168, + "learning_rate": 1.0084984086283755e-06, + "loss": 0.3652, + "step": 5875 + }, + { + "epoch": 4.074895977808599, + "grad_norm": 0.37398271118769805, + "learning_rate": 1.0070409829069394e-06, + "loss": 0.3402, + "step": 5876 + }, + { + "epoch": 4.075589459084605, + "grad_norm": 0.40278854472036707, + "learning_rate": 1.005584493106177e-06, + "loss": 0.3741, + "step": 5877 + }, + { + "epoch": 4.076282940360611, + "grad_norm": 0.40458204526559793, + "learning_rate": 1.0041289395674802e-06, + "loss": 0.332, + "step": 5878 + }, + { + "epoch": 4.0769764216366156, + "grad_norm": 0.4404472793691462, + "learning_rate": 1.0026743226320223e-06, + "loss": 0.3343, + "step": 5879 + }, + { + "epoch": 4.077669902912621, + "grad_norm": 0.45093009069115725, + "learning_rate": 1.0012206426407518e-06, + "loss": 0.346, + "step": 5880 + }, + { + "epoch": 4.078363384188627, + "grad_norm": 0.3908810592958925, + "learning_rate": 9.99767899934402e-07, + "loss": 0.3303, + "step": 5881 + }, + { + "epoch": 4.079056865464633, + "grad_norm": 0.38634414914175186, + "learning_rate": 9.983160948534854e-07, + "loss": 0.35, + "step": 5882 + }, + { + "epoch": 4.079750346740638, + "grad_norm": 0.4354320078855466, + "learning_rate": 9.968652277382946e-07, + "loss": 0.3374, + "step": 5883 + }, + { + "epoch": 4.0804438280166435, + "grad_norm": 0.529310444422906, + "learning_rate": 9.95415298928904e-07, + "loss": 0.3336, + "step": 5884 + }, + { + "epoch": 4.081137309292649, + "grad_norm": 0.40194326137499664, + "learning_rate": 9.93966308765163e-07, + "loss": 0.3417, + "step": 5885 + }, + { + "epoch": 4.081830790568655, + "grad_norm": 0.6448066822501168, + "learning_rate": 9.925182575867066e-07, + "loss": 0.4051, + "step": 5886 + }, + { + "epoch": 4.08252427184466, + "grad_norm": 0.9929883512578793, + "learning_rate": 9.91071145732948e-07, + "loss": 0.3598, + "step": 5887 + }, + { + "epoch": 4.083217753120666, + "grad_norm": 0.41716032255389357, + "learning_rate": 9.896249735430774e-07, + "loss": 0.3477, + "step": 5888 + }, + { + "epoch": 4.0839112343966715, + "grad_norm": 0.384175744172019, + "learning_rate": 9.88179741356069e-07, + "loss": 0.309, + "step": 5889 + }, + { + "epoch": 4.084604715672677, + "grad_norm": 0.4018504510083021, + "learning_rate": 9.86735449510674e-07, + "loss": 0.3624, + "step": 5890 + }, + { + "epoch": 4.085298196948682, + "grad_norm": 0.41067440319039933, + "learning_rate": 9.852920983454239e-07, + "loss": 0.3689, + "step": 5891 + }, + { + "epoch": 4.085991678224688, + "grad_norm": 0.3958140720628448, + "learning_rate": 9.83849688198632e-07, + "loss": 0.328, + "step": 5892 + }, + { + "epoch": 4.086685159500694, + "grad_norm": 0.36029107973716085, + "learning_rate": 9.82408219408385e-07, + "loss": 0.3096, + "step": 5893 + }, + { + "epoch": 4.087378640776699, + "grad_norm": 0.4065999334232476, + "learning_rate": 9.809676923125549e-07, + "loss": 0.3268, + "step": 5894 + }, + { + "epoch": 4.088072122052704, + "grad_norm": 0.41157403666730424, + "learning_rate": 9.795281072487917e-07, + "loss": 0.3241, + "step": 5895 + }, + { + "epoch": 4.08876560332871, + "grad_norm": 0.5145356643195996, + "learning_rate": 9.780894645545215e-07, + "loss": 0.363, + "step": 5896 + }, + { + "epoch": 4.089459084604716, + "grad_norm": 0.3803748193319084, + "learning_rate": 9.76651764566952e-07, + "loss": 0.3753, + "step": 5897 + }, + { + "epoch": 4.090152565880722, + "grad_norm": 0.42946475021254504, + "learning_rate": 9.752150076230727e-07, + "loss": 0.3482, + "step": 5898 + }, + { + "epoch": 4.0908460471567265, + "grad_norm": 0.39938763817123146, + "learning_rate": 9.737791940596436e-07, + "loss": 0.319, + "step": 5899 + }, + { + "epoch": 4.091539528432732, + "grad_norm": 0.42568017656411583, + "learning_rate": 9.723443242132152e-07, + "loss": 0.3413, + "step": 5900 + }, + { + "epoch": 4.092233009708738, + "grad_norm": 0.38800643997655987, + "learning_rate": 9.709103984201058e-07, + "loss": 0.36, + "step": 5901 + }, + { + "epoch": 4.092926490984744, + "grad_norm": 0.3792710196069069, + "learning_rate": 9.69477417016419e-07, + "loss": 0.3158, + "step": 5902 + }, + { + "epoch": 4.093619972260749, + "grad_norm": 0.4367420226668608, + "learning_rate": 9.680453803380368e-07, + "loss": 0.3745, + "step": 5903 + }, + { + "epoch": 4.0943134535367545, + "grad_norm": 0.4136972954177396, + "learning_rate": 9.666142887206153e-07, + "loss": 0.3122, + "step": 5904 + }, + { + "epoch": 4.09500693481276, + "grad_norm": 0.4449547954714318, + "learning_rate": 9.651841424995933e-07, + "loss": 0.3539, + "step": 5905 + }, + { + "epoch": 4.095700416088766, + "grad_norm": 0.4268763192142163, + "learning_rate": 9.637549420101877e-07, + "loss": 0.3224, + "step": 5906 + }, + { + "epoch": 4.096393897364771, + "grad_norm": 0.4298972184021089, + "learning_rate": 9.62326687587391e-07, + "loss": 0.3361, + "step": 5907 + }, + { + "epoch": 4.097087378640777, + "grad_norm": 0.3866986165135999, + "learning_rate": 9.608993795659766e-07, + "loss": 0.3259, + "step": 5908 + }, + { + "epoch": 4.097780859916782, + "grad_norm": 0.43035224091724156, + "learning_rate": 9.59473018280495e-07, + "loss": 0.3551, + "step": 5909 + }, + { + "epoch": 4.098474341192788, + "grad_norm": 0.39659620986529964, + "learning_rate": 9.580476040652748e-07, + "loss": 0.3592, + "step": 5910 + }, + { + "epoch": 4.099167822468793, + "grad_norm": 0.3942190192841053, + "learning_rate": 9.566231372544244e-07, + "loss": 0.3405, + "step": 5911 + }, + { + "epoch": 4.099861303744799, + "grad_norm": 0.40724862990032207, + "learning_rate": 9.551996181818263e-07, + "loss": 0.3423, + "step": 5912 + }, + { + "epoch": 4.100554785020805, + "grad_norm": 0.4696136676485921, + "learning_rate": 9.53777047181143e-07, + "loss": 0.3506, + "step": 5913 + }, + { + "epoch": 4.10124826629681, + "grad_norm": 0.4474527847864227, + "learning_rate": 9.52355424585818e-07, + "loss": 0.3231, + "step": 5914 + }, + { + "epoch": 4.101941747572815, + "grad_norm": 0.40233172511560633, + "learning_rate": 9.50934750729065e-07, + "loss": 0.3463, + "step": 5915 + }, + { + "epoch": 4.102635228848821, + "grad_norm": 0.44204400797241866, + "learning_rate": 9.495150259438835e-07, + "loss": 0.3997, + "step": 5916 + }, + { + "epoch": 4.103328710124827, + "grad_norm": 0.3905413372346672, + "learning_rate": 9.48096250563042e-07, + "loss": 0.3351, + "step": 5917 + }, + { + "epoch": 4.104022191400833, + "grad_norm": 0.4214916449942419, + "learning_rate": 9.466784249190952e-07, + "loss": 0.3823, + "step": 5918 + }, + { + "epoch": 4.1047156726768375, + "grad_norm": 0.4693407755650625, + "learning_rate": 9.452615493443718e-07, + "loss": 0.4101, + "step": 5919 + }, + { + "epoch": 4.105409153952843, + "grad_norm": 0.4044415532348893, + "learning_rate": 9.438456241709742e-07, + "loss": 0.3501, + "step": 5920 + }, + { + "epoch": 4.106102635228849, + "grad_norm": 0.38990588957861816, + "learning_rate": 9.424306497307873e-07, + "loss": 0.3529, + "step": 5921 + }, + { + "epoch": 4.106796116504855, + "grad_norm": 0.3892388756390445, + "learning_rate": 9.410166263554687e-07, + "loss": 0.3555, + "step": 5922 + }, + { + "epoch": 4.10748959778086, + "grad_norm": 0.39392931065563647, + "learning_rate": 9.396035543764559e-07, + "loss": 0.3293, + "step": 5923 + }, + { + "epoch": 4.108183079056865, + "grad_norm": 0.4369203656868996, + "learning_rate": 9.381914341249648e-07, + "loss": 0.3242, + "step": 5924 + }, + { + "epoch": 4.108876560332871, + "grad_norm": 0.3794263858141677, + "learning_rate": 9.367802659319835e-07, + "loss": 0.3378, + "step": 5925 + }, + { + "epoch": 4.109570041608877, + "grad_norm": 0.40245901640539045, + "learning_rate": 9.353700501282803e-07, + "loss": 0.3622, + "step": 5926 + }, + { + "epoch": 4.110263522884882, + "grad_norm": 0.4093408644622792, + "learning_rate": 9.339607870444001e-07, + "loss": 0.3772, + "step": 5927 + }, + { + "epoch": 4.110957004160888, + "grad_norm": 0.4384772975164476, + "learning_rate": 9.325524770106637e-07, + "loss": 0.3555, + "step": 5928 + }, + { + "epoch": 4.111650485436893, + "grad_norm": 0.3961629506761391, + "learning_rate": 9.311451203571697e-07, + "loss": 0.3173, + "step": 5929 + }, + { + "epoch": 4.112343966712899, + "grad_norm": 0.39463723101452375, + "learning_rate": 9.297387174137912e-07, + "loss": 0.3583, + "step": 5930 + }, + { + "epoch": 4.113037447988904, + "grad_norm": 0.4836288365850371, + "learning_rate": 9.283332685101782e-07, + "loss": 0.3444, + "step": 5931 + }, + { + "epoch": 4.11373092926491, + "grad_norm": 0.40424304754832785, + "learning_rate": 9.269287739757604e-07, + "loss": 0.3742, + "step": 5932 + }, + { + "epoch": 4.114424410540916, + "grad_norm": 0.39021018133113516, + "learning_rate": 9.25525234139738e-07, + "loss": 0.3604, + "step": 5933 + }, + { + "epoch": 4.115117891816921, + "grad_norm": 0.4221221170168946, + "learning_rate": 9.241226493310917e-07, + "loss": 0.3373, + "step": 5934 + }, + { + "epoch": 4.115811373092926, + "grad_norm": 0.39570986963436744, + "learning_rate": 9.22721019878578e-07, + "loss": 0.3286, + "step": 5935 + }, + { + "epoch": 4.116504854368932, + "grad_norm": 0.7735755872938727, + "learning_rate": 9.213203461107278e-07, + "loss": 0.3301, + "step": 5936 + }, + { + "epoch": 4.117198335644938, + "grad_norm": 0.5641395627446331, + "learning_rate": 9.19920628355851e-07, + "loss": 0.3625, + "step": 5937 + }, + { + "epoch": 4.1178918169209435, + "grad_norm": 0.41126362889400475, + "learning_rate": 9.185218669420282e-07, + "loss": 0.3114, + "step": 5938 + }, + { + "epoch": 4.118585298196948, + "grad_norm": 0.44893621491319846, + "learning_rate": 9.1712406219712e-07, + "loss": 0.3674, + "step": 5939 + }, + { + "epoch": 4.119278779472954, + "grad_norm": 0.3922788482957072, + "learning_rate": 9.157272144487634e-07, + "loss": 0.3372, + "step": 5940 + }, + { + "epoch": 4.11997226074896, + "grad_norm": 0.42533536484553314, + "learning_rate": 9.143313240243668e-07, + "loss": 0.3729, + "step": 5941 + }, + { + "epoch": 4.120665742024966, + "grad_norm": 0.39574415186274814, + "learning_rate": 9.129363912511185e-07, + "loss": 0.3431, + "step": 5942 + }, + { + "epoch": 4.121359223300971, + "grad_norm": 0.37377146082736573, + "learning_rate": 9.11542416455981e-07, + "loss": 0.312, + "step": 5943 + }, + { + "epoch": 4.122052704576976, + "grad_norm": 0.38868937143851934, + "learning_rate": 9.101493999656885e-07, + "loss": 0.3245, + "step": 5944 + }, + { + "epoch": 4.122746185852982, + "grad_norm": 0.43829911951592504, + "learning_rate": 9.087573421067591e-07, + "loss": 0.3338, + "step": 5945 + }, + { + "epoch": 4.123439667128988, + "grad_norm": 0.406545811090325, + "learning_rate": 9.073662432054775e-07, + "loss": 0.3589, + "step": 5946 + }, + { + "epoch": 4.124133148404993, + "grad_norm": 0.42980007882605714, + "learning_rate": 9.059761035879083e-07, + "loss": 0.3624, + "step": 5947 + }, + { + "epoch": 4.124826629680999, + "grad_norm": 0.4208110418250585, + "learning_rate": 9.04586923579891e-07, + "loss": 0.3287, + "step": 5948 + }, + { + "epoch": 4.125520110957004, + "grad_norm": 0.4490583891032571, + "learning_rate": 9.031987035070378e-07, + "loss": 0.3536, + "step": 5949 + }, + { + "epoch": 4.12621359223301, + "grad_norm": 0.40240423539407566, + "learning_rate": 9.018114436947373e-07, + "loss": 0.3243, + "step": 5950 + }, + { + "epoch": 4.126907073509015, + "grad_norm": 0.39077472347062414, + "learning_rate": 9.004251444681556e-07, + "loss": 0.3186, + "step": 5951 + }, + { + "epoch": 4.127600554785021, + "grad_norm": 0.40872134210578254, + "learning_rate": 8.990398061522282e-07, + "loss": 0.3755, + "step": 5952 + }, + { + "epoch": 4.1282940360610265, + "grad_norm": 0.40280782359654577, + "learning_rate": 8.976554290716699e-07, + "loss": 0.3417, + "step": 5953 + }, + { + "epoch": 4.128987517337032, + "grad_norm": 0.4519412795671945, + "learning_rate": 8.962720135509678e-07, + "loss": 0.4052, + "step": 5954 + }, + { + "epoch": 4.129680998613037, + "grad_norm": 0.4498345766489382, + "learning_rate": 8.948895599143859e-07, + "loss": 0.3573, + "step": 5955 + }, + { + "epoch": 4.130374479889043, + "grad_norm": 0.8581724152914949, + "learning_rate": 8.935080684859615e-07, + "loss": 0.3591, + "step": 5956 + }, + { + "epoch": 4.131067961165049, + "grad_norm": 0.408925491212508, + "learning_rate": 8.921275395895041e-07, + "loss": 0.3293, + "step": 5957 + }, + { + "epoch": 4.1317614424410545, + "grad_norm": 0.4218063585602398, + "learning_rate": 8.907479735486002e-07, + "loss": 0.3174, + "step": 5958 + }, + { + "epoch": 4.132454923717059, + "grad_norm": 0.4434643511461139, + "learning_rate": 8.893693706866124e-07, + "loss": 0.3907, + "step": 5959 + }, + { + "epoch": 4.133148404993065, + "grad_norm": 0.4006570102175896, + "learning_rate": 8.879917313266728e-07, + "loss": 0.3735, + "step": 5960 + }, + { + "epoch": 4.133841886269071, + "grad_norm": 0.40069527693707424, + "learning_rate": 8.866150557916914e-07, + "loss": 0.3619, + "step": 5961 + }, + { + "epoch": 4.134535367545077, + "grad_norm": 0.4299383824455946, + "learning_rate": 8.852393444043478e-07, + "loss": 0.3409, + "step": 5962 + }, + { + "epoch": 4.1352288488210815, + "grad_norm": 0.3962675644371142, + "learning_rate": 8.838645974871029e-07, + "loss": 0.3119, + "step": 5963 + }, + { + "epoch": 4.135922330097087, + "grad_norm": 0.4933003927011079, + "learning_rate": 8.824908153621875e-07, + "loss": 0.4305, + "step": 5964 + }, + { + "epoch": 4.136615811373093, + "grad_norm": 0.4183634729188962, + "learning_rate": 8.811179983516027e-07, + "loss": 0.3732, + "step": 5965 + }, + { + "epoch": 4.137309292649099, + "grad_norm": 0.40206622991495006, + "learning_rate": 8.797461467771301e-07, + "loss": 0.3261, + "step": 5966 + }, + { + "epoch": 4.138002773925104, + "grad_norm": 0.377053128798786, + "learning_rate": 8.783752609603197e-07, + "loss": 0.338, + "step": 5967 + }, + { + "epoch": 4.1386962552011095, + "grad_norm": 0.40261347438723244, + "learning_rate": 8.770053412224972e-07, + "loss": 0.3026, + "step": 5968 + }, + { + "epoch": 4.139389736477115, + "grad_norm": 0.39967669849322496, + "learning_rate": 8.756363878847646e-07, + "loss": 0.3854, + "step": 5969 + }, + { + "epoch": 4.140083217753121, + "grad_norm": 0.43750729509161024, + "learning_rate": 8.742684012679908e-07, + "loss": 0.3753, + "step": 5970 + }, + { + "epoch": 4.140776699029126, + "grad_norm": 0.36648013514674793, + "learning_rate": 8.729013816928239e-07, + "loss": 0.3214, + "step": 5971 + }, + { + "epoch": 4.141470180305132, + "grad_norm": 0.46374835610045173, + "learning_rate": 8.715353294796835e-07, + "loss": 0.4039, + "step": 5972 + }, + { + "epoch": 4.1421636615811375, + "grad_norm": 0.41435647361787886, + "learning_rate": 8.70170244948762e-07, + "loss": 0.3908, + "step": 5973 + }, + { + "epoch": 4.142857142857143, + "grad_norm": 0.38278049151288557, + "learning_rate": 8.688061284200266e-07, + "loss": 0.3379, + "step": 5974 + }, + { + "epoch": 4.143550624133148, + "grad_norm": 0.4432340706873847, + "learning_rate": 8.67442980213214e-07, + "loss": 0.3513, + "step": 5975 + }, + { + "epoch": 4.144244105409154, + "grad_norm": 0.38333512090241434, + "learning_rate": 8.660808006478371e-07, + "loss": 0.3252, + "step": 5976 + }, + { + "epoch": 4.14493758668516, + "grad_norm": 0.4037543696388098, + "learning_rate": 8.647195900431832e-07, + "loss": 0.3137, + "step": 5977 + }, + { + "epoch": 4.145631067961165, + "grad_norm": 0.44960096169288244, + "learning_rate": 8.633593487183067e-07, + "loss": 0.4103, + "step": 5978 + }, + { + "epoch": 4.14632454923717, + "grad_norm": 0.40445276088021065, + "learning_rate": 8.6200007699204e-07, + "loss": 0.3763, + "step": 5979 + }, + { + "epoch": 4.147018030513176, + "grad_norm": 1.0045933412764532, + "learning_rate": 8.60641775182986e-07, + "loss": 0.3595, + "step": 5980 + }, + { + "epoch": 4.147711511789182, + "grad_norm": 0.7626982371670392, + "learning_rate": 8.592844436095216e-07, + "loss": 0.3185, + "step": 5981 + }, + { + "epoch": 4.148404993065188, + "grad_norm": 0.43361266885090044, + "learning_rate": 8.579280825897968e-07, + "loss": 0.3635, + "step": 5982 + }, + { + "epoch": 4.1490984743411925, + "grad_norm": 0.4164723941003652, + "learning_rate": 8.565726924417295e-07, + "loss": 0.3923, + "step": 5983 + }, + { + "epoch": 4.149791955617198, + "grad_norm": 0.3900446498113785, + "learning_rate": 8.55218273483015e-07, + "loss": 0.3261, + "step": 5984 + }, + { + "epoch": 4.150485436893204, + "grad_norm": 0.36950207591264156, + "learning_rate": 8.538648260311205e-07, + "loss": 0.3114, + "step": 5985 + }, + { + "epoch": 4.15117891816921, + "grad_norm": 0.4048665697589546, + "learning_rate": 8.525123504032817e-07, + "loss": 0.3273, + "step": 5986 + }, + { + "epoch": 4.151872399445215, + "grad_norm": 0.36152353071455035, + "learning_rate": 8.511608469165106e-07, + "loss": 0.3212, + "step": 5987 + }, + { + "epoch": 4.1525658807212205, + "grad_norm": 0.44967533876443433, + "learning_rate": 8.498103158875909e-07, + "loss": 0.3581, + "step": 5988 + }, + { + "epoch": 4.153259361997226, + "grad_norm": 0.3985874198622114, + "learning_rate": 8.484607576330733e-07, + "loss": 0.3388, + "step": 5989 + }, + { + "epoch": 4.153952843273232, + "grad_norm": 0.5023222614355668, + "learning_rate": 8.471121724692905e-07, + "loss": 0.3592, + "step": 5990 + }, + { + "epoch": 4.154646324549237, + "grad_norm": 0.4497067355037071, + "learning_rate": 8.457645607123361e-07, + "loss": 0.3461, + "step": 5991 + }, + { + "epoch": 4.155339805825243, + "grad_norm": 0.3565151091482395, + "learning_rate": 8.444179226780824e-07, + "loss": 0.3374, + "step": 5992 + }, + { + "epoch": 4.156033287101248, + "grad_norm": 0.4077691814134206, + "learning_rate": 8.430722586821721e-07, + "loss": 0.3254, + "step": 5993 + }, + { + "epoch": 4.156726768377254, + "grad_norm": 0.43058583104823883, + "learning_rate": 8.417275690400178e-07, + "loss": 0.3292, + "step": 5994 + }, + { + "epoch": 4.157420249653259, + "grad_norm": 0.40196706942006355, + "learning_rate": 8.403838540668058e-07, + "loss": 0.3636, + "step": 5995 + }, + { + "epoch": 4.158113730929265, + "grad_norm": 0.41010432217836973, + "learning_rate": 8.390411140774945e-07, + "loss": 0.3366, + "step": 5996 + }, + { + "epoch": 4.158807212205271, + "grad_norm": 0.3737938462591656, + "learning_rate": 8.37699349386809e-07, + "loss": 0.3386, + "step": 5997 + }, + { + "epoch": 4.159500693481276, + "grad_norm": 0.4315241239685576, + "learning_rate": 8.363585603092517e-07, + "loss": 0.3938, + "step": 5998 + }, + { + "epoch": 4.160194174757281, + "grad_norm": 0.46110209008123965, + "learning_rate": 8.350187471590937e-07, + "loss": 0.4185, + "step": 5999 + }, + { + "epoch": 4.160887656033287, + "grad_norm": 0.39658353146422376, + "learning_rate": 8.336799102503762e-07, + "loss": 0.376, + "step": 6000 + }, + { + "epoch": 4.161581137309293, + "grad_norm": 0.39401883680631355, + "learning_rate": 8.323420498969159e-07, + "loss": 0.3656, + "step": 6001 + }, + { + "epoch": 4.162274618585299, + "grad_norm": 0.4290292016727245, + "learning_rate": 8.310051664122937e-07, + "loss": 0.3373, + "step": 6002 + }, + { + "epoch": 4.1629680998613035, + "grad_norm": 0.4184393192229492, + "learning_rate": 8.296692601098688e-07, + "loss": 0.3525, + "step": 6003 + }, + { + "epoch": 4.163661581137309, + "grad_norm": 0.42324902300214917, + "learning_rate": 8.283343313027654e-07, + "loss": 0.3027, + "step": 6004 + }, + { + "epoch": 4.164355062413315, + "grad_norm": 0.4434263665341341, + "learning_rate": 8.270003803038817e-07, + "loss": 0.4057, + "step": 6005 + }, + { + "epoch": 4.165048543689321, + "grad_norm": 0.3853786799031601, + "learning_rate": 8.25667407425888e-07, + "loss": 0.3338, + "step": 6006 + }, + { + "epoch": 4.165742024965326, + "grad_norm": 0.40260227349286065, + "learning_rate": 8.243354129812192e-07, + "loss": 0.3522, + "step": 6007 + }, + { + "epoch": 4.166435506241331, + "grad_norm": 0.37600486500593155, + "learning_rate": 8.230043972820895e-07, + "loss": 0.3298, + "step": 6008 + }, + { + "epoch": 4.167128987517337, + "grad_norm": 0.4463058140550583, + "learning_rate": 8.216743606404793e-07, + "loss": 0.3317, + "step": 6009 + }, + { + "epoch": 4.167822468793343, + "grad_norm": 0.377188534850174, + "learning_rate": 8.203453033681368e-07, + "loss": 0.3285, + "step": 6010 + }, + { + "epoch": 4.168515950069348, + "grad_norm": 0.3889852476800571, + "learning_rate": 8.190172257765855e-07, + "loss": 0.3523, + "step": 6011 + }, + { + "epoch": 4.169209431345354, + "grad_norm": 0.38463457226080766, + "learning_rate": 8.176901281771154e-07, + "loss": 0.3563, + "step": 6012 + }, + { + "epoch": 4.169902912621359, + "grad_norm": 0.40416817629559865, + "learning_rate": 8.163640108807897e-07, + "loss": 0.3811, + "step": 6013 + }, + { + "epoch": 4.170596393897365, + "grad_norm": 0.43964283176161695, + "learning_rate": 8.150388741984416e-07, + "loss": 0.3552, + "step": 6014 + }, + { + "epoch": 4.17128987517337, + "grad_norm": 0.4314773291786829, + "learning_rate": 8.137147184406718e-07, + "loss": 0.3339, + "step": 6015 + }, + { + "epoch": 4.171983356449376, + "grad_norm": 0.3670685364278709, + "learning_rate": 8.123915439178531e-07, + "loss": 0.3034, + "step": 6016 + }, + { + "epoch": 4.172676837725382, + "grad_norm": 0.41926676658602996, + "learning_rate": 8.11069350940128e-07, + "loss": 0.3383, + "step": 6017 + }, + { + "epoch": 4.173370319001387, + "grad_norm": 0.4872134877953159, + "learning_rate": 8.097481398174101e-07, + "loss": 0.3494, + "step": 6018 + }, + { + "epoch": 4.174063800277392, + "grad_norm": 0.39761396661231746, + "learning_rate": 8.084279108593818e-07, + "loss": 0.3512, + "step": 6019 + }, + { + "epoch": 4.174757281553398, + "grad_norm": 0.4256801841918406, + "learning_rate": 8.071086643754933e-07, + "loss": 0.3691, + "step": 6020 + }, + { + "epoch": 4.175450762829404, + "grad_norm": 0.4234607184847094, + "learning_rate": 8.057904006749673e-07, + "loss": 0.3333, + "step": 6021 + }, + { + "epoch": 4.1761442441054095, + "grad_norm": 0.4429490869933367, + "learning_rate": 8.044731200667966e-07, + "loss": 0.3713, + "step": 6022 + }, + { + "epoch": 4.176837725381414, + "grad_norm": 0.4475007897306103, + "learning_rate": 8.031568228597403e-07, + "loss": 0.2945, + "step": 6023 + }, + { + "epoch": 4.17753120665742, + "grad_norm": 0.4321793958307334, + "learning_rate": 8.0184150936233e-07, + "loss": 0.415, + "step": 6024 + }, + { + "epoch": 4.178224687933426, + "grad_norm": 0.5080867078775074, + "learning_rate": 8.005271798828646e-07, + "loss": 0.3486, + "step": 6025 + }, + { + "epoch": 4.178918169209432, + "grad_norm": 0.5286780112924887, + "learning_rate": 7.992138347294148e-07, + "loss": 0.38, + "step": 6026 + }, + { + "epoch": 4.179611650485437, + "grad_norm": 0.39056671330678505, + "learning_rate": 7.979014742098196e-07, + "loss": 0.3593, + "step": 6027 + }, + { + "epoch": 4.180305131761442, + "grad_norm": 0.4141165486544236, + "learning_rate": 7.965900986316849e-07, + "loss": 0.3288, + "step": 6028 + }, + { + "epoch": 4.180998613037448, + "grad_norm": 0.4363692200440037, + "learning_rate": 7.952797083023883e-07, + "loss": 0.3561, + "step": 6029 + }, + { + "epoch": 4.181692094313454, + "grad_norm": 0.3898890656791092, + "learning_rate": 7.939703035290774e-07, + "loss": 0.3271, + "step": 6030 + }, + { + "epoch": 4.182385575589459, + "grad_norm": 0.4176585231952008, + "learning_rate": 7.926618846186646e-07, + "loss": 0.3816, + "step": 6031 + }, + { + "epoch": 4.1830790568654646, + "grad_norm": 0.4494198167361945, + "learning_rate": 7.913544518778349e-07, + "loss": 0.3794, + "step": 6032 + }, + { + "epoch": 4.18377253814147, + "grad_norm": 0.37593638717670064, + "learning_rate": 7.900480056130428e-07, + "loss": 0.3497, + "step": 6033 + }, + { + "epoch": 4.184466019417476, + "grad_norm": 0.5669971707711042, + "learning_rate": 7.887425461305059e-07, + "loss": 0.3476, + "step": 6034 + }, + { + "epoch": 4.185159500693481, + "grad_norm": 0.4724069941554161, + "learning_rate": 7.874380737362186e-07, + "loss": 0.3518, + "step": 6035 + }, + { + "epoch": 4.185852981969487, + "grad_norm": 0.3758445148826483, + "learning_rate": 7.861345887359372e-07, + "loss": 0.3323, + "step": 6036 + }, + { + "epoch": 4.1865464632454925, + "grad_norm": 0.512148301918811, + "learning_rate": 7.848320914351903e-07, + "loss": 0.3346, + "step": 6037 + }, + { + "epoch": 4.187239944521498, + "grad_norm": 0.45113931383264844, + "learning_rate": 7.835305821392741e-07, + "loss": 0.3678, + "step": 6038 + }, + { + "epoch": 4.187933425797503, + "grad_norm": 0.39604189770432496, + "learning_rate": 7.822300611532513e-07, + "loss": 0.2974, + "step": 6039 + }, + { + "epoch": 4.188626907073509, + "grad_norm": 0.4020553967608756, + "learning_rate": 7.809305287819557e-07, + "loss": 0.3334, + "step": 6040 + }, + { + "epoch": 4.189320388349515, + "grad_norm": 0.5780842483561379, + "learning_rate": 7.7963198532999e-07, + "loss": 0.3646, + "step": 6041 + }, + { + "epoch": 4.1900138696255205, + "grad_norm": 0.36381579064490266, + "learning_rate": 7.783344311017183e-07, + "loss": 0.3384, + "step": 6042 + }, + { + "epoch": 4.190707350901525, + "grad_norm": 0.36730056127974264, + "learning_rate": 7.770378664012839e-07, + "loss": 0.3294, + "step": 6043 + }, + { + "epoch": 4.191400832177531, + "grad_norm": 0.43717286925053483, + "learning_rate": 7.757422915325885e-07, + "loss": 0.3287, + "step": 6044 + }, + { + "epoch": 4.192094313453537, + "grad_norm": 0.40476188782189276, + "learning_rate": 7.744477067993061e-07, + "loss": 0.367, + "step": 6045 + }, + { + "epoch": 4.192787794729543, + "grad_norm": 0.45175950463074016, + "learning_rate": 7.731541125048798e-07, + "loss": 0.3167, + "step": 6046 + }, + { + "epoch": 4.1934812760055475, + "grad_norm": 0.5092667367776222, + "learning_rate": 7.718615089525161e-07, + "loss": 0.3613, + "step": 6047 + }, + { + "epoch": 4.194174757281553, + "grad_norm": 0.8766702068560327, + "learning_rate": 7.705698964451941e-07, + "loss": 0.3363, + "step": 6048 + }, + { + "epoch": 4.194868238557559, + "grad_norm": 0.41008619122123174, + "learning_rate": 7.692792752856564e-07, + "loss": 0.291, + "step": 6049 + }, + { + "epoch": 4.195561719833565, + "grad_norm": 0.41500164653505817, + "learning_rate": 7.679896457764164e-07, + "loss": 0.3666, + "step": 6050 + }, + { + "epoch": 4.19625520110957, + "grad_norm": 0.42725730863798383, + "learning_rate": 7.667010082197534e-07, + "loss": 0.3482, + "step": 6051 + }, + { + "epoch": 4.1969486823855755, + "grad_norm": 0.4148846355993886, + "learning_rate": 7.654133629177152e-07, + "loss": 0.3782, + "step": 6052 + }, + { + "epoch": 4.197642163661581, + "grad_norm": 0.39087434176662567, + "learning_rate": 7.641267101721179e-07, + "loss": 0.3146, + "step": 6053 + }, + { + "epoch": 4.198335644937587, + "grad_norm": 0.4047503038136691, + "learning_rate": 7.628410502845401e-07, + "loss": 0.3263, + "step": 6054 + }, + { + "epoch": 4.199029126213592, + "grad_norm": 0.3849298003850758, + "learning_rate": 7.615563835563339e-07, + "loss": 0.333, + "step": 6055 + }, + { + "epoch": 4.199722607489598, + "grad_norm": 0.4547638404162615, + "learning_rate": 7.602727102886165e-07, + "loss": 0.352, + "step": 6056 + }, + { + "epoch": 4.2004160887656035, + "grad_norm": 0.3783034975932897, + "learning_rate": 7.589900307822684e-07, + "loss": 0.3541, + "step": 6057 + }, + { + "epoch": 4.201109570041609, + "grad_norm": 0.39462513301232205, + "learning_rate": 7.577083453379425e-07, + "loss": 0.3497, + "step": 6058 + }, + { + "epoch": 4.201803051317614, + "grad_norm": 0.5958447544116373, + "learning_rate": 7.564276542560578e-07, + "loss": 0.3593, + "step": 6059 + }, + { + "epoch": 4.20249653259362, + "grad_norm": 0.3887050554245725, + "learning_rate": 7.551479578367948e-07, + "loss": 0.342, + "step": 6060 + }, + { + "epoch": 4.203190013869626, + "grad_norm": 0.40702570868356264, + "learning_rate": 7.538692563801103e-07, + "loss": 0.3351, + "step": 6061 + }, + { + "epoch": 4.203883495145631, + "grad_norm": 0.3722924768078192, + "learning_rate": 7.525915501857189e-07, + "loss": 0.313, + "step": 6062 + }, + { + "epoch": 4.204576976421636, + "grad_norm": 0.40094754439997143, + "learning_rate": 7.513148395531073e-07, + "loss": 0.3444, + "step": 6063 + }, + { + "epoch": 4.205270457697642, + "grad_norm": 0.40667576909304676, + "learning_rate": 7.50039124781528e-07, + "loss": 0.3824, + "step": 6064 + }, + { + "epoch": 4.205963938973648, + "grad_norm": 0.5936452773907123, + "learning_rate": 7.487644061699966e-07, + "loss": 0.3355, + "step": 6065 + }, + { + "epoch": 4.206657420249654, + "grad_norm": 0.46290525806142524, + "learning_rate": 7.474906840173001e-07, + "loss": 0.3634, + "step": 6066 + }, + { + "epoch": 4.2073509015256585, + "grad_norm": 0.4565555658930163, + "learning_rate": 7.462179586219897e-07, + "loss": 0.3311, + "step": 6067 + }, + { + "epoch": 4.208044382801664, + "grad_norm": 0.46827195103233166, + "learning_rate": 7.449462302823818e-07, + "loss": 0.3484, + "step": 6068 + }, + { + "epoch": 4.20873786407767, + "grad_norm": 0.3895267045665284, + "learning_rate": 7.436754992965606e-07, + "loss": 0.3438, + "step": 6069 + }, + { + "epoch": 4.209431345353676, + "grad_norm": 0.47370338584444854, + "learning_rate": 7.424057659623767e-07, + "loss": 0.4025, + "step": 6070 + }, + { + "epoch": 4.210124826629681, + "grad_norm": 0.39832369121029065, + "learning_rate": 7.411370305774468e-07, + "loss": 0.3472, + "step": 6071 + }, + { + "epoch": 4.2108183079056865, + "grad_norm": 0.4029940029859894, + "learning_rate": 7.398692934391532e-07, + "loss": 0.3454, + "step": 6072 + }, + { + "epoch": 4.211511789181692, + "grad_norm": 0.4834483036173078, + "learning_rate": 7.386025548446435e-07, + "loss": 0.3544, + "step": 6073 + }, + { + "epoch": 4.212205270457698, + "grad_norm": 0.4857192886392201, + "learning_rate": 7.373368150908316e-07, + "loss": 0.3783, + "step": 6074 + }, + { + "epoch": 4.212898751733703, + "grad_norm": 0.39459037231814265, + "learning_rate": 7.360720744744004e-07, + "loss": 0.3306, + "step": 6075 + }, + { + "epoch": 4.213592233009709, + "grad_norm": 0.3910367689324389, + "learning_rate": 7.348083332917927e-07, + "loss": 0.3484, + "step": 6076 + }, + { + "epoch": 4.214285714285714, + "grad_norm": 0.43108697736448265, + "learning_rate": 7.33545591839222e-07, + "loss": 0.3341, + "step": 6077 + }, + { + "epoch": 4.21497919556172, + "grad_norm": 0.4354529586504891, + "learning_rate": 7.322838504126651e-07, + "loss": 0.3752, + "step": 6078 + }, + { + "epoch": 4.215672676837725, + "grad_norm": 0.4277510314031144, + "learning_rate": 7.310231093078657e-07, + "loss": 0.3444, + "step": 6079 + }, + { + "epoch": 4.216366158113731, + "grad_norm": 0.4203538970356623, + "learning_rate": 7.297633688203332e-07, + "loss": 0.3083, + "step": 6080 + }, + { + "epoch": 4.217059639389737, + "grad_norm": 0.438697512780434, + "learning_rate": 7.28504629245339e-07, + "loss": 0.3546, + "step": 6081 + }, + { + "epoch": 4.217753120665742, + "grad_norm": 0.4618963687939622, + "learning_rate": 7.272468908779245e-07, + "loss": 0.2975, + "step": 6082 + }, + { + "epoch": 4.218446601941747, + "grad_norm": 0.393360744750457, + "learning_rate": 7.25990154012895e-07, + "loss": 0.3558, + "step": 6083 + }, + { + "epoch": 4.219140083217753, + "grad_norm": 0.40685227710677674, + "learning_rate": 7.247344189448186e-07, + "loss": 0.3241, + "step": 6084 + }, + { + "epoch": 4.219833564493759, + "grad_norm": 0.9771687624258758, + "learning_rate": 7.23479685968031e-07, + "loss": 0.3544, + "step": 6085 + }, + { + "epoch": 4.220527045769765, + "grad_norm": 0.3881488861077096, + "learning_rate": 7.222259553766348e-07, + "loss": 0.3293, + "step": 6086 + }, + { + "epoch": 4.221220527045769, + "grad_norm": 0.38743087425628003, + "learning_rate": 7.20973227464491e-07, + "loss": 0.3428, + "step": 6087 + }, + { + "epoch": 4.221914008321775, + "grad_norm": 0.4626890137947432, + "learning_rate": 7.197215025252347e-07, + "loss": 0.3759, + "step": 6088 + }, + { + "epoch": 4.222607489597781, + "grad_norm": 0.8928292265606828, + "learning_rate": 7.184707808522578e-07, + "loss": 0.3556, + "step": 6089 + }, + { + "epoch": 4.223300970873787, + "grad_norm": 0.37934929811524004, + "learning_rate": 7.172210627387216e-07, + "loss": 0.3569, + "step": 6090 + }, + { + "epoch": 4.223994452149792, + "grad_norm": 0.4136645989588592, + "learning_rate": 7.159723484775522e-07, + "loss": 0.3466, + "step": 6091 + }, + { + "epoch": 4.224687933425797, + "grad_norm": 0.39614707350283174, + "learning_rate": 7.14724638361437e-07, + "loss": 0.3327, + "step": 6092 + }, + { + "epoch": 4.225381414701803, + "grad_norm": 0.4049604409686824, + "learning_rate": 7.134779326828317e-07, + "loss": 0.3859, + "step": 6093 + }, + { + "epoch": 4.226074895977809, + "grad_norm": 2.7061576392777082, + "learning_rate": 7.122322317339542e-07, + "loss": 0.3291, + "step": 6094 + }, + { + "epoch": 4.226768377253814, + "grad_norm": 0.45460105757616354, + "learning_rate": 7.109875358067875e-07, + "loss": 0.352, + "step": 6095 + }, + { + "epoch": 4.22746185852982, + "grad_norm": 0.382801251984125, + "learning_rate": 7.0974384519308e-07, + "loss": 0.369, + "step": 6096 + }, + { + "epoch": 4.228155339805825, + "grad_norm": 0.5070626489124913, + "learning_rate": 7.085011601843439e-07, + "loss": 0.3465, + "step": 6097 + }, + { + "epoch": 4.228848821081831, + "grad_norm": 0.4075644593585711, + "learning_rate": 7.072594810718564e-07, + "loss": 0.3506, + "step": 6098 + }, + { + "epoch": 4.229542302357836, + "grad_norm": 0.4223945750640261, + "learning_rate": 7.060188081466556e-07, + "loss": 0.3869, + "step": 6099 + }, + { + "epoch": 4.230235783633842, + "grad_norm": 0.4251773787927419, + "learning_rate": 7.04779141699548e-07, + "loss": 0.3489, + "step": 6100 + }, + { + "epoch": 4.2309292649098476, + "grad_norm": 0.40065375915387225, + "learning_rate": 7.035404820211034e-07, + "loss": 0.3374, + "step": 6101 + }, + { + "epoch": 4.231622746185853, + "grad_norm": 0.5339284059432036, + "learning_rate": 7.02302829401652e-07, + "loss": 0.3644, + "step": 6102 + }, + { + "epoch": 4.232316227461858, + "grad_norm": 0.38640013398988654, + "learning_rate": 7.010661841312921e-07, + "loss": 0.3145, + "step": 6103 + }, + { + "epoch": 4.233009708737864, + "grad_norm": 0.42984289939723686, + "learning_rate": 6.998305464998856e-07, + "loss": 0.3982, + "step": 6104 + }, + { + "epoch": 4.23370319001387, + "grad_norm": 0.41997515202628516, + "learning_rate": 6.98595916797053e-07, + "loss": 0.366, + "step": 6105 + }, + { + "epoch": 4.2343966712898755, + "grad_norm": 0.42410292693418583, + "learning_rate": 6.973622953121878e-07, + "loss": 0.3858, + "step": 6106 + }, + { + "epoch": 4.23509015256588, + "grad_norm": 1.6808598567672512, + "learning_rate": 6.961296823344388e-07, + "loss": 0.3145, + "step": 6107 + }, + { + "epoch": 4.235783633841886, + "grad_norm": 0.4062479700908595, + "learning_rate": 6.948980781527214e-07, + "loss": 0.3542, + "step": 6108 + }, + { + "epoch": 4.236477115117892, + "grad_norm": 0.42045071797037775, + "learning_rate": 6.936674830557167e-07, + "loss": 0.3669, + "step": 6109 + }, + { + "epoch": 4.237170596393898, + "grad_norm": 0.4016978652511189, + "learning_rate": 6.924378973318651e-07, + "loss": 0.3565, + "step": 6110 + }, + { + "epoch": 4.237864077669903, + "grad_norm": 0.3964139730815441, + "learning_rate": 6.912093212693738e-07, + "loss": 0.3306, + "step": 6111 + }, + { + "epoch": 4.238557558945908, + "grad_norm": 0.4616711831972742, + "learning_rate": 6.899817551562127e-07, + "loss": 0.3354, + "step": 6112 + }, + { + "epoch": 4.239251040221914, + "grad_norm": 0.4031277226578537, + "learning_rate": 6.887551992801123e-07, + "loss": 0.3124, + "step": 6113 + }, + { + "epoch": 4.23994452149792, + "grad_norm": 0.3825425788061083, + "learning_rate": 6.875296539285697e-07, + "loss": 0.3284, + "step": 6114 + }, + { + "epoch": 4.240638002773925, + "grad_norm": 0.5139002770828404, + "learning_rate": 6.863051193888443e-07, + "loss": 0.3795, + "step": 6115 + }, + { + "epoch": 4.2413314840499305, + "grad_norm": 0.4457720003365879, + "learning_rate": 6.850815959479573e-07, + "loss": 0.345, + "step": 6116 + }, + { + "epoch": 4.242024965325936, + "grad_norm": 0.41310283107303697, + "learning_rate": 6.838590838926951e-07, + "loss": 0.3024, + "step": 6117 + }, + { + "epoch": 4.242718446601942, + "grad_norm": 0.4375598526906385, + "learning_rate": 6.826375835096038e-07, + "loss": 0.3356, + "step": 6118 + }, + { + "epoch": 4.243411927877947, + "grad_norm": 0.5014727873065993, + "learning_rate": 6.814170950849952e-07, + "loss": 0.3702, + "step": 6119 + }, + { + "epoch": 4.244105409153953, + "grad_norm": 0.46374848345406067, + "learning_rate": 6.801976189049436e-07, + "loss": 0.3669, + "step": 6120 + }, + { + "epoch": 4.2447988904299585, + "grad_norm": 0.3987952691796923, + "learning_rate": 6.789791552552838e-07, + "loss": 0.3382, + "step": 6121 + }, + { + "epoch": 4.245492371705964, + "grad_norm": 0.4623724769354124, + "learning_rate": 6.777617044216161e-07, + "loss": 0.3275, + "step": 6122 + }, + { + "epoch": 4.246185852981969, + "grad_norm": 0.47179020358321583, + "learning_rate": 6.765452666893013e-07, + "loss": 0.3714, + "step": 6123 + }, + { + "epoch": 4.246879334257975, + "grad_norm": 0.3920320920597318, + "learning_rate": 6.75329842343464e-07, + "loss": 0.3267, + "step": 6124 + }, + { + "epoch": 4.247572815533981, + "grad_norm": 0.4637309647230182, + "learning_rate": 6.741154316689918e-07, + "loss": 0.2894, + "step": 6125 + }, + { + "epoch": 4.2482662968099865, + "grad_norm": 0.38636002550708953, + "learning_rate": 6.729020349505322e-07, + "loss": 0.339, + "step": 6126 + }, + { + "epoch": 4.248959778085991, + "grad_norm": 0.40719786163839855, + "learning_rate": 6.716896524724975e-07, + "loss": 0.3415, + "step": 6127 + }, + { + "epoch": 4.249653259361997, + "grad_norm": 0.4352294958256507, + "learning_rate": 6.704782845190622e-07, + "loss": 0.3587, + "step": 6128 + }, + { + "epoch": 4.250346740638003, + "grad_norm": 0.4077453097774442, + "learning_rate": 6.692679313741596e-07, + "loss": 0.3563, + "step": 6129 + }, + { + "epoch": 4.251040221914009, + "grad_norm": 0.4146069695584132, + "learning_rate": 6.680585933214895e-07, + "loss": 0.3169, + "step": 6130 + }, + { + "epoch": 4.2517337031900135, + "grad_norm": 0.4942546675884171, + "learning_rate": 6.668502706445129e-07, + "loss": 0.3472, + "step": 6131 + }, + { + "epoch": 4.252427184466019, + "grad_norm": 0.47257267893951527, + "learning_rate": 6.656429636264483e-07, + "loss": 0.3582, + "step": 6132 + }, + { + "epoch": 4.253120665742025, + "grad_norm": 0.4030679496912566, + "learning_rate": 6.644366725502844e-07, + "loss": 0.3536, + "step": 6133 + }, + { + "epoch": 4.253814147018031, + "grad_norm": 0.420845710532777, + "learning_rate": 6.632313976987637e-07, + "loss": 0.3116, + "step": 6134 + }, + { + "epoch": 4.254507628294036, + "grad_norm": 0.4339162212423455, + "learning_rate": 6.620271393543954e-07, + "loss": 0.3534, + "step": 6135 + }, + { + "epoch": 4.2552011095700415, + "grad_norm": 0.43094117989243275, + "learning_rate": 6.608238977994491e-07, + "loss": 0.3845, + "step": 6136 + }, + { + "epoch": 4.255894590846047, + "grad_norm": 0.4114484366764265, + "learning_rate": 6.596216733159544e-07, + "loss": 0.3377, + "step": 6137 + }, + { + "epoch": 4.256588072122053, + "grad_norm": 0.42784565493387683, + "learning_rate": 6.584204661857063e-07, + "loss": 0.3363, + "step": 6138 + }, + { + "epoch": 4.257281553398058, + "grad_norm": 0.4453671767248613, + "learning_rate": 6.572202766902569e-07, + "loss": 0.3485, + "step": 6139 + }, + { + "epoch": 4.257975034674064, + "grad_norm": 0.6829864492937774, + "learning_rate": 6.560211051109222e-07, + "loss": 0.3758, + "step": 6140 + }, + { + "epoch": 4.2586685159500695, + "grad_norm": 0.41882460832398527, + "learning_rate": 6.548229517287802e-07, + "loss": 0.3665, + "step": 6141 + }, + { + "epoch": 4.259361997226075, + "grad_norm": 0.4046853028028085, + "learning_rate": 6.53625816824669e-07, + "loss": 0.3371, + "step": 6142 + }, + { + "epoch": 4.26005547850208, + "grad_norm": 0.4067265391069463, + "learning_rate": 6.524297006791891e-07, + "loss": 0.3405, + "step": 6143 + }, + { + "epoch": 4.260748959778086, + "grad_norm": 0.43429700655270675, + "learning_rate": 6.512346035727002e-07, + "loss": 0.3675, + "step": 6144 + }, + { + "epoch": 4.261442441054092, + "grad_norm": 0.4036417739625904, + "learning_rate": 6.500405257853249e-07, + "loss": 0.3486, + "step": 6145 + }, + { + "epoch": 4.262135922330097, + "grad_norm": 0.42098798131586823, + "learning_rate": 6.488474675969475e-07, + "loss": 0.3478, + "step": 6146 + }, + { + "epoch": 4.262829403606102, + "grad_norm": 0.3916550706558104, + "learning_rate": 6.476554292872101e-07, + "loss": 0.3865, + "step": 6147 + }, + { + "epoch": 4.263522884882108, + "grad_norm": 0.3901404966600708, + "learning_rate": 6.46464411135519e-07, + "loss": 0.3003, + "step": 6148 + }, + { + "epoch": 4.264216366158114, + "grad_norm": 0.44135706046747253, + "learning_rate": 6.452744134210409e-07, + "loss": 0.3262, + "step": 6149 + }, + { + "epoch": 4.26490984743412, + "grad_norm": 0.393616806805249, + "learning_rate": 6.440854364227e-07, + "loss": 0.3376, + "step": 6150 + }, + { + "epoch": 4.2656033287101245, + "grad_norm": 0.49525311947866135, + "learning_rate": 6.428974804191879e-07, + "loss": 0.3261, + "step": 6151 + }, + { + "epoch": 4.26629680998613, + "grad_norm": 0.47144963602483675, + "learning_rate": 6.4171054568895e-07, + "loss": 0.3503, + "step": 6152 + }, + { + "epoch": 4.266990291262136, + "grad_norm": 0.6374627627921992, + "learning_rate": 6.405246325101955e-07, + "loss": 0.3802, + "step": 6153 + }, + { + "epoch": 4.267683772538142, + "grad_norm": 0.383849705197869, + "learning_rate": 6.393397411608954e-07, + "loss": 0.3505, + "step": 6154 + }, + { + "epoch": 4.268377253814147, + "grad_norm": 0.5479555072255522, + "learning_rate": 6.38155871918778e-07, + "loss": 0.3897, + "step": 6155 + }, + { + "epoch": 4.2690707350901524, + "grad_norm": 0.4261171194009753, + "learning_rate": 6.369730250613337e-07, + "loss": 0.3601, + "step": 6156 + }, + { + "epoch": 4.269764216366158, + "grad_norm": 0.4239048841822734, + "learning_rate": 6.357912008658151e-07, + "loss": 0.3419, + "step": 6157 + }, + { + "epoch": 4.270457697642164, + "grad_norm": 0.4657155620165313, + "learning_rate": 6.346103996092313e-07, + "loss": 0.3165, + "step": 6158 + }, + { + "epoch": 4.271151178918169, + "grad_norm": 0.5491868563638476, + "learning_rate": 6.334306215683533e-07, + "loss": 0.3789, + "step": 6159 + }, + { + "epoch": 4.271844660194175, + "grad_norm": 0.4319250838472781, + "learning_rate": 6.322518670197142e-07, + "loss": 0.392, + "step": 6160 + }, + { + "epoch": 4.27253814147018, + "grad_norm": 0.4328432241317883, + "learning_rate": 6.310741362396044e-07, + "loss": 0.319, + "step": 6161 + }, + { + "epoch": 4.273231622746186, + "grad_norm": 0.3736659692890957, + "learning_rate": 6.298974295040771e-07, + "loss": 0.3671, + "step": 6162 + }, + { + "epoch": 4.273925104022191, + "grad_norm": 0.40190227149262936, + "learning_rate": 6.287217470889412e-07, + "loss": 0.3638, + "step": 6163 + }, + { + "epoch": 4.274618585298197, + "grad_norm": 0.713328387990018, + "learning_rate": 6.275470892697699e-07, + "loss": 0.3158, + "step": 6164 + }, + { + "epoch": 4.275312066574203, + "grad_norm": 0.46495555001833455, + "learning_rate": 6.263734563218949e-07, + "loss": 0.3586, + "step": 6165 + }, + { + "epoch": 4.276005547850208, + "grad_norm": 0.43175522940112815, + "learning_rate": 6.252008485204053e-07, + "loss": 0.386, + "step": 6166 + }, + { + "epoch": 4.276699029126213, + "grad_norm": 1.1876253593297061, + "learning_rate": 6.240292661401531e-07, + "loss": 0.3104, + "step": 6167 + }, + { + "epoch": 4.277392510402219, + "grad_norm": 0.37604448193583445, + "learning_rate": 6.228587094557487e-07, + "loss": 0.328, + "step": 6168 + }, + { + "epoch": 4.278085991678225, + "grad_norm": 0.4312877507264945, + "learning_rate": 6.216891787415618e-07, + "loss": 0.3199, + "step": 6169 + }, + { + "epoch": 4.278779472954231, + "grad_norm": 0.46327010950639513, + "learning_rate": 6.205206742717235e-07, + "loss": 0.3204, + "step": 6170 + }, + { + "epoch": 4.279472954230235, + "grad_norm": 0.4300735295759126, + "learning_rate": 6.193531963201204e-07, + "loss": 0.3708, + "step": 6171 + }, + { + "epoch": 4.280166435506241, + "grad_norm": 0.406823148007132, + "learning_rate": 6.181867451604017e-07, + "loss": 0.4043, + "step": 6172 + }, + { + "epoch": 4.280859916782247, + "grad_norm": 0.5703469794329353, + "learning_rate": 6.17021321065977e-07, + "loss": 0.3539, + "step": 6173 + }, + { + "epoch": 4.281553398058253, + "grad_norm": 0.4194635742416221, + "learning_rate": 6.158569243100098e-07, + "loss": 0.345, + "step": 6174 + }, + { + "epoch": 4.282246879334258, + "grad_norm": 0.39675649447701894, + "learning_rate": 6.146935551654298e-07, + "loss": 0.3916, + "step": 6175 + }, + { + "epoch": 4.282940360610263, + "grad_norm": 0.3940883541329325, + "learning_rate": 6.135312139049194e-07, + "loss": 0.3705, + "step": 6176 + }, + { + "epoch": 4.283633841886269, + "grad_norm": 0.3779188880551679, + "learning_rate": 6.123699008009226e-07, + "loss": 0.3421, + "step": 6177 + }, + { + "epoch": 4.284327323162275, + "grad_norm": 0.4424265070556396, + "learning_rate": 6.11209616125647e-07, + "loss": 0.3658, + "step": 6178 + }, + { + "epoch": 4.28502080443828, + "grad_norm": 0.4250513685627804, + "learning_rate": 6.10050360151051e-07, + "loss": 0.3291, + "step": 6179 + }, + { + "epoch": 4.285714285714286, + "grad_norm": 0.45555341494865514, + "learning_rate": 6.088921331488568e-07, + "loss": 0.359, + "step": 6180 + }, + { + "epoch": 4.286407766990291, + "grad_norm": 0.43125831610772375, + "learning_rate": 6.077349353905465e-07, + "loss": 0.3107, + "step": 6181 + }, + { + "epoch": 4.287101248266297, + "grad_norm": 0.4039363152679757, + "learning_rate": 6.065787671473556e-07, + "loss": 0.3776, + "step": 6182 + }, + { + "epoch": 4.287794729542302, + "grad_norm": 0.38540240716953844, + "learning_rate": 6.054236286902837e-07, + "loss": 0.3467, + "step": 6183 + }, + { + "epoch": 4.288488210818308, + "grad_norm": 0.4579479097403802, + "learning_rate": 6.042695202900855e-07, + "loss": 0.3558, + "step": 6184 + }, + { + "epoch": 4.2891816920943135, + "grad_norm": 0.423682803441983, + "learning_rate": 6.031164422172764e-07, + "loss": 0.3732, + "step": 6185 + }, + { + "epoch": 4.289875173370319, + "grad_norm": 0.3877837283274081, + "learning_rate": 6.019643947421294e-07, + "loss": 0.327, + "step": 6186 + }, + { + "epoch": 4.290568654646324, + "grad_norm": 0.4520411830361061, + "learning_rate": 6.008133781346764e-07, + "loss": 0.3852, + "step": 6187 + }, + { + "epoch": 4.29126213592233, + "grad_norm": 0.5937281099550518, + "learning_rate": 5.996633926647083e-07, + "loss": 0.3452, + "step": 6188 + }, + { + "epoch": 4.291955617198336, + "grad_norm": 0.417706808891538, + "learning_rate": 5.985144386017711e-07, + "loss": 0.3901, + "step": 6189 + }, + { + "epoch": 4.2926490984743415, + "grad_norm": 0.41980789291666265, + "learning_rate": 5.973665162151721e-07, + "loss": 0.3793, + "step": 6190 + }, + { + "epoch": 4.293342579750346, + "grad_norm": 0.4472645091665417, + "learning_rate": 5.962196257739778e-07, + "loss": 0.3072, + "step": 6191 + }, + { + "epoch": 4.294036061026352, + "grad_norm": 0.6167083905787968, + "learning_rate": 5.950737675470081e-07, + "loss": 0.3215, + "step": 6192 + }, + { + "epoch": 4.294729542302358, + "grad_norm": 0.42346397021702487, + "learning_rate": 5.939289418028455e-07, + "loss": 0.3411, + "step": 6193 + }, + { + "epoch": 4.295423023578364, + "grad_norm": 0.4414484901294475, + "learning_rate": 5.927851488098297e-07, + "loss": 0.3803, + "step": 6194 + }, + { + "epoch": 4.296116504854369, + "grad_norm": 0.5654918787529339, + "learning_rate": 5.916423888360546e-07, + "loss": 0.3475, + "step": 6195 + }, + { + "epoch": 4.296809986130374, + "grad_norm": 0.40382204330186966, + "learning_rate": 5.905006621493787e-07, + "loss": 0.345, + "step": 6196 + }, + { + "epoch": 4.29750346740638, + "grad_norm": 0.6680270898026206, + "learning_rate": 5.893599690174113e-07, + "loss": 0.3631, + "step": 6197 + }, + { + "epoch": 4.298196948682386, + "grad_norm": 0.4477827265195121, + "learning_rate": 5.88220309707524e-07, + "loss": 0.37, + "step": 6198 + }, + { + "epoch": 4.298890429958391, + "grad_norm": 0.9396659341260173, + "learning_rate": 5.870816844868454e-07, + "loss": 0.3488, + "step": 6199 + }, + { + "epoch": 4.2995839112343965, + "grad_norm": 0.42597375744288096, + "learning_rate": 5.859440936222588e-07, + "loss": 0.3244, + "step": 6200 + }, + { + "epoch": 4.300277392510402, + "grad_norm": 0.40344659290859214, + "learning_rate": 5.848075373804091e-07, + "loss": 0.3465, + "step": 6201 + }, + { + "epoch": 4.300970873786408, + "grad_norm": 0.38904474351836377, + "learning_rate": 5.836720160276971e-07, + "loss": 0.3514, + "step": 6202 + }, + { + "epoch": 4.301664355062413, + "grad_norm": 0.4250164064598548, + "learning_rate": 5.825375298302788e-07, + "loss": 0.3493, + "step": 6203 + }, + { + "epoch": 4.302357836338419, + "grad_norm": 0.3879367816339768, + "learning_rate": 5.814040790540709e-07, + "loss": 0.3584, + "step": 6204 + }, + { + "epoch": 4.3030513176144245, + "grad_norm": 0.4094681652813443, + "learning_rate": 5.80271663964746e-07, + "loss": 0.3432, + "step": 6205 + }, + { + "epoch": 4.30374479889043, + "grad_norm": 0.4356177516472818, + "learning_rate": 5.791402848277338e-07, + "loss": 0.3526, + "step": 6206 + }, + { + "epoch": 4.304438280166435, + "grad_norm": 0.42138383489532405, + "learning_rate": 5.780099419082225e-07, + "loss": 0.371, + "step": 6207 + }, + { + "epoch": 4.305131761442441, + "grad_norm": 0.41835172082853456, + "learning_rate": 5.768806354711542e-07, + "loss": 0.3453, + "step": 6208 + }, + { + "epoch": 4.305825242718447, + "grad_norm": 0.41561602078900267, + "learning_rate": 5.757523657812314e-07, + "loss": 0.374, + "step": 6209 + }, + { + "epoch": 4.3065187239944525, + "grad_norm": 0.45871631927087475, + "learning_rate": 5.746251331029129e-07, + "loss": 0.4013, + "step": 6210 + }, + { + "epoch": 4.307212205270457, + "grad_norm": 0.3994641431752328, + "learning_rate": 5.734989377004119e-07, + "loss": 0.3125, + "step": 6211 + }, + { + "epoch": 4.307905686546463, + "grad_norm": 0.40984984446095474, + "learning_rate": 5.723737798377021e-07, + "loss": 0.3997, + "step": 6212 + }, + { + "epoch": 4.308599167822469, + "grad_norm": 0.41280406942887543, + "learning_rate": 5.712496597785122e-07, + "loss": 0.3075, + "step": 6213 + }, + { + "epoch": 4.309292649098475, + "grad_norm": 0.37925251180171826, + "learning_rate": 5.701265777863268e-07, + "loss": 0.3312, + "step": 6214 + }, + { + "epoch": 4.3099861303744795, + "grad_norm": 0.38193550940595977, + "learning_rate": 5.690045341243905e-07, + "loss": 0.3328, + "step": 6215 + }, + { + "epoch": 4.310679611650485, + "grad_norm": 0.397144039490869, + "learning_rate": 5.678835290556995e-07, + "loss": 0.335, + "step": 6216 + }, + { + "epoch": 4.311373092926491, + "grad_norm": 0.37757178720838747, + "learning_rate": 5.667635628430102e-07, + "loss": 0.3237, + "step": 6217 + }, + { + "epoch": 4.312066574202497, + "grad_norm": 0.4136908646046677, + "learning_rate": 5.65644635748836e-07, + "loss": 0.3471, + "step": 6218 + }, + { + "epoch": 4.312760055478502, + "grad_norm": 0.3860292890394113, + "learning_rate": 5.645267480354427e-07, + "loss": 0.3291, + "step": 6219 + }, + { + "epoch": 4.3134535367545075, + "grad_norm": 0.3939865909723036, + "learning_rate": 5.63409899964858e-07, + "loss": 0.361, + "step": 6220 + }, + { + "epoch": 4.314147018030513, + "grad_norm": 0.3810758534510466, + "learning_rate": 5.62294091798859e-07, + "loss": 0.3643, + "step": 6221 + }, + { + "epoch": 4.314840499306519, + "grad_norm": 0.42709070042461467, + "learning_rate": 5.611793237989866e-07, + "loss": 0.362, + "step": 6222 + }, + { + "epoch": 4.315533980582524, + "grad_norm": 0.42060904025699275, + "learning_rate": 5.600655962265345e-07, + "loss": 0.3858, + "step": 6223 + }, + { + "epoch": 4.31622746185853, + "grad_norm": 0.37044351820730226, + "learning_rate": 5.589529093425495e-07, + "loss": 0.2944, + "step": 6224 + }, + { + "epoch": 4.3169209431345354, + "grad_norm": 0.4018705583531944, + "learning_rate": 5.578412634078406e-07, + "loss": 0.3219, + "step": 6225 + }, + { + "epoch": 4.317614424410541, + "grad_norm": 0.4552219393699127, + "learning_rate": 5.567306586829668e-07, + "loss": 0.3141, + "step": 6226 + }, + { + "epoch": 4.318307905686546, + "grad_norm": 0.43884630038911576, + "learning_rate": 5.556210954282465e-07, + "loss": 0.3614, + "step": 6227 + }, + { + "epoch": 4.319001386962552, + "grad_norm": 0.41863593852180997, + "learning_rate": 5.545125739037555e-07, + "loss": 0.4291, + "step": 6228 + }, + { + "epoch": 4.319694868238558, + "grad_norm": 0.7679789964136878, + "learning_rate": 5.534050943693197e-07, + "loss": 0.3392, + "step": 6229 + }, + { + "epoch": 4.320388349514563, + "grad_norm": 0.38067444619224, + "learning_rate": 5.522986570845257e-07, + "loss": 0.3032, + "step": 6230 + }, + { + "epoch": 4.321081830790568, + "grad_norm": 0.4142321804738146, + "learning_rate": 5.511932623087163e-07, + "loss": 0.3256, + "step": 6231 + }, + { + "epoch": 4.321775312066574, + "grad_norm": 0.38731523768753373, + "learning_rate": 5.500889103009855e-07, + "loss": 0.3356, + "step": 6232 + }, + { + "epoch": 4.32246879334258, + "grad_norm": 0.4440993313107543, + "learning_rate": 5.489856013201872e-07, + "loss": 0.4083, + "step": 6233 + }, + { + "epoch": 4.323162274618586, + "grad_norm": 0.40240024547632747, + "learning_rate": 5.478833356249274e-07, + "loss": 0.3223, + "step": 6234 + }, + { + "epoch": 4.3238557558945905, + "grad_norm": 0.39262817711177267, + "learning_rate": 5.467821134735701e-07, + "loss": 0.3742, + "step": 6235 + }, + { + "epoch": 4.324549237170596, + "grad_norm": 0.408222550882554, + "learning_rate": 5.456819351242349e-07, + "loss": 0.3413, + "step": 6236 + }, + { + "epoch": 4.325242718446602, + "grad_norm": 0.456644835750475, + "learning_rate": 5.445828008347925e-07, + "loss": 0.3547, + "step": 6237 + }, + { + "epoch": 4.325936199722608, + "grad_norm": 0.9994355490037153, + "learning_rate": 5.434847108628749e-07, + "loss": 0.3583, + "step": 6238 + }, + { + "epoch": 4.326629680998613, + "grad_norm": 0.4090478902096986, + "learning_rate": 5.42387665465865e-07, + "loss": 0.3516, + "step": 6239 + }, + { + "epoch": 4.327323162274618, + "grad_norm": 0.3891216014127711, + "learning_rate": 5.412916649009026e-07, + "loss": 0.3685, + "step": 6240 + }, + { + "epoch": 4.328016643550624, + "grad_norm": 0.4146122313921773, + "learning_rate": 5.401967094248839e-07, + "loss": 0.3381, + "step": 6241 + }, + { + "epoch": 4.32871012482663, + "grad_norm": 0.4328714296396735, + "learning_rate": 5.391027992944559e-07, + "loss": 0.3898, + "step": 6242 + }, + { + "epoch": 4.329403606102635, + "grad_norm": 0.428828804359781, + "learning_rate": 5.380099347660245e-07, + "loss": 0.3412, + "step": 6243 + }, + { + "epoch": 4.330097087378641, + "grad_norm": 0.4038367750406883, + "learning_rate": 5.369181160957498e-07, + "loss": 0.3394, + "step": 6244 + }, + { + "epoch": 4.330790568654646, + "grad_norm": 0.4331393800229408, + "learning_rate": 5.358273435395451e-07, + "loss": 0.3666, + "step": 6245 + }, + { + "epoch": 4.331484049930652, + "grad_norm": 0.41032139653484206, + "learning_rate": 5.3473761735308e-07, + "loss": 0.3314, + "step": 6246 + }, + { + "epoch": 4.332177531206657, + "grad_norm": 0.4241927325262719, + "learning_rate": 5.336489377917786e-07, + "loss": 0.3606, + "step": 6247 + }, + { + "epoch": 4.332871012482663, + "grad_norm": 0.42256931019101407, + "learning_rate": 5.325613051108181e-07, + "loss": 0.3651, + "step": 6248 + }, + { + "epoch": 4.333564493758669, + "grad_norm": 0.4423421216341831, + "learning_rate": 5.314747195651349e-07, + "loss": 0.3193, + "step": 6249 + }, + { + "epoch": 4.334257975034674, + "grad_norm": 0.7089264750675331, + "learning_rate": 5.303891814094137e-07, + "loss": 0.3419, + "step": 6250 + }, + { + "epoch": 4.334951456310679, + "grad_norm": 0.3979645535749209, + "learning_rate": 5.293046908980982e-07, + "loss": 0.3716, + "step": 6251 + }, + { + "epoch": 4.335644937586685, + "grad_norm": 0.4140966129946514, + "learning_rate": 5.282212482853855e-07, + "loss": 0.349, + "step": 6252 + }, + { + "epoch": 4.336338418862691, + "grad_norm": 0.4322214711878763, + "learning_rate": 5.271388538252254e-07, + "loss": 0.3773, + "step": 6253 + }, + { + "epoch": 4.3370319001386965, + "grad_norm": 0.46375529184788306, + "learning_rate": 5.260575077713237e-07, + "loss": 0.3685, + "step": 6254 + }, + { + "epoch": 4.337725381414701, + "grad_norm": 0.5093682290616093, + "learning_rate": 5.249772103771411e-07, + "loss": 0.3539, + "step": 6255 + }, + { + "epoch": 4.338418862690707, + "grad_norm": 0.4151991185971279, + "learning_rate": 5.2389796189589e-07, + "loss": 0.3256, + "step": 6256 + }, + { + "epoch": 4.339112343966713, + "grad_norm": 0.4876112463290477, + "learning_rate": 5.228197625805392e-07, + "loss": 0.3727, + "step": 6257 + }, + { + "epoch": 4.339805825242719, + "grad_norm": 0.44803030415871914, + "learning_rate": 5.21742612683811e-07, + "loss": 0.3829, + "step": 6258 + }, + { + "epoch": 4.340499306518724, + "grad_norm": 0.4020862925962174, + "learning_rate": 5.206665124581811e-07, + "loss": 0.3804, + "step": 6259 + }, + { + "epoch": 4.341192787794729, + "grad_norm": 0.4247183562690849, + "learning_rate": 5.195914621558812e-07, + "loss": 0.3645, + "step": 6260 + }, + { + "epoch": 4.341886269070735, + "grad_norm": 0.45851218777202424, + "learning_rate": 5.185174620288924e-07, + "loss": 0.3938, + "step": 6261 + }, + { + "epoch": 4.342579750346741, + "grad_norm": 0.43178059180305894, + "learning_rate": 5.174445123289546e-07, + "loss": 0.3288, + "step": 6262 + }, + { + "epoch": 4.343273231622746, + "grad_norm": 0.3941526023398659, + "learning_rate": 5.163726133075597e-07, + "loss": 0.3337, + "step": 6263 + }, + { + "epoch": 4.343966712898752, + "grad_norm": 1.1298319852933496, + "learning_rate": 5.153017652159509e-07, + "loss": 0.3462, + "step": 6264 + }, + { + "epoch": 4.344660194174757, + "grad_norm": 0.3863566658702163, + "learning_rate": 5.1423196830513e-07, + "loss": 0.3548, + "step": 6265 + }, + { + "epoch": 4.345353675450763, + "grad_norm": 0.389988149443278, + "learning_rate": 5.131632228258459e-07, + "loss": 0.3556, + "step": 6266 + }, + { + "epoch": 4.346047156726768, + "grad_norm": 0.4067820454890897, + "learning_rate": 5.120955290286089e-07, + "loss": 0.362, + "step": 6267 + }, + { + "epoch": 4.346740638002774, + "grad_norm": 0.4314836365529451, + "learning_rate": 5.110288871636776e-07, + "loss": 0.3869, + "step": 6268 + }, + { + "epoch": 4.3474341192787795, + "grad_norm": 0.4523265486060939, + "learning_rate": 5.099632974810631e-07, + "loss": 0.3537, + "step": 6269 + }, + { + "epoch": 4.348127600554785, + "grad_norm": 0.4452057505688693, + "learning_rate": 5.088987602305351e-07, + "loss": 0.3421, + "step": 6270 + }, + { + "epoch": 4.34882108183079, + "grad_norm": 0.41812913581316336, + "learning_rate": 5.0783527566161e-07, + "loss": 0.3743, + "step": 6271 + }, + { + "epoch": 4.349514563106796, + "grad_norm": 0.36653016088480334, + "learning_rate": 5.067728440235626e-07, + "loss": 0.3179, + "step": 6272 + }, + { + "epoch": 4.350208044382802, + "grad_norm": 0.4147025993998611, + "learning_rate": 5.0571146556542e-07, + "loss": 0.3377, + "step": 6273 + }, + { + "epoch": 4.3509015256588075, + "grad_norm": 0.4269001973458045, + "learning_rate": 5.0465114053596e-07, + "loss": 0.35, + "step": 6274 + }, + { + "epoch": 4.351595006934812, + "grad_norm": 0.39427856589738997, + "learning_rate": 5.035918691837155e-07, + "loss": 0.376, + "step": 6275 + }, + { + "epoch": 4.352288488210818, + "grad_norm": 0.3995736149075299, + "learning_rate": 5.025336517569723e-07, + "loss": 0.3358, + "step": 6276 + }, + { + "epoch": 4.352981969486824, + "grad_norm": 0.45975735789614397, + "learning_rate": 5.014764885037693e-07, + "loss": 0.3249, + "step": 6277 + }, + { + "epoch": 4.35367545076283, + "grad_norm": 0.3900761533925085, + "learning_rate": 5.004203796718987e-07, + "loss": 0.3369, + "step": 6278 + }, + { + "epoch": 4.354368932038835, + "grad_norm": 0.41847197456481755, + "learning_rate": 4.993653255089021e-07, + "loss": 0.328, + "step": 6279 + }, + { + "epoch": 4.35506241331484, + "grad_norm": 0.46397746855439015, + "learning_rate": 4.983113262620781e-07, + "loss": 0.3395, + "step": 6280 + }, + { + "epoch": 4.355755894590846, + "grad_norm": 0.4085685443900029, + "learning_rate": 4.972583821784777e-07, + "loss": 0.3839, + "step": 6281 + }, + { + "epoch": 4.356449375866852, + "grad_norm": 0.4094690357682738, + "learning_rate": 4.962064935049016e-07, + "loss": 0.3914, + "step": 6282 + }, + { + "epoch": 4.357142857142857, + "grad_norm": 0.4166281529519862, + "learning_rate": 4.951556604879049e-07, + "loss": 0.3212, + "step": 6283 + }, + { + "epoch": 4.3578363384188625, + "grad_norm": 0.4490450622483123, + "learning_rate": 4.941058833737956e-07, + "loss": 0.3723, + "step": 6284 + }, + { + "epoch": 4.358529819694868, + "grad_norm": 0.41668814494773443, + "learning_rate": 4.930571624086339e-07, + "loss": 0.3753, + "step": 6285 + }, + { + "epoch": 4.359223300970874, + "grad_norm": 0.3950256038203281, + "learning_rate": 4.920094978382339e-07, + "loss": 0.3274, + "step": 6286 + }, + { + "epoch": 4.359916782246879, + "grad_norm": 0.3805846131438291, + "learning_rate": 4.909628899081581e-07, + "loss": 0.3214, + "step": 6287 + }, + { + "epoch": 4.360610263522885, + "grad_norm": 0.38900389826563797, + "learning_rate": 4.899173388637252e-07, + "loss": 0.3505, + "step": 6288 + }, + { + "epoch": 4.3613037447988905, + "grad_norm": 0.39999103900945704, + "learning_rate": 4.888728449500052e-07, + "loss": 0.3641, + "step": 6289 + }, + { + "epoch": 4.361997226074896, + "grad_norm": 0.40785582825012384, + "learning_rate": 4.878294084118185e-07, + "loss": 0.3523, + "step": 6290 + }, + { + "epoch": 4.362690707350901, + "grad_norm": 0.428957856667886, + "learning_rate": 4.867870294937393e-07, + "loss": 0.362, + "step": 6291 + }, + { + "epoch": 4.363384188626907, + "grad_norm": 0.39429097731269275, + "learning_rate": 4.857457084400957e-07, + "loss": 0.3558, + "step": 6292 + }, + { + "epoch": 4.364077669902913, + "grad_norm": 0.43027300052080414, + "learning_rate": 4.847054454949617e-07, + "loss": 0.3165, + "step": 6293 + }, + { + "epoch": 4.3647711511789185, + "grad_norm": 0.4001717709838788, + "learning_rate": 4.836662409021725e-07, + "loss": 0.3794, + "step": 6294 + }, + { + "epoch": 4.365464632454923, + "grad_norm": 0.38825690155737724, + "learning_rate": 4.826280949053064e-07, + "loss": 0.3565, + "step": 6295 + }, + { + "epoch": 4.366158113730929, + "grad_norm": 0.41986818401117687, + "learning_rate": 4.815910077476987e-07, + "loss": 0.403, + "step": 6296 + }, + { + "epoch": 4.366851595006935, + "grad_norm": 0.40056475962480215, + "learning_rate": 4.805549796724357e-07, + "loss": 0.338, + "step": 6297 + }, + { + "epoch": 4.367545076282941, + "grad_norm": 0.41571163317843407, + "learning_rate": 4.79520010922353e-07, + "loss": 0.3623, + "step": 6298 + }, + { + "epoch": 4.3682385575589455, + "grad_norm": 0.3724373986559259, + "learning_rate": 4.784861017400411e-07, + "loss": 0.3194, + "step": 6299 + }, + { + "epoch": 4.368932038834951, + "grad_norm": 0.5453313923600107, + "learning_rate": 4.774532523678415e-07, + "loss": 0.3037, + "step": 6300 + }, + { + "epoch": 4.369625520110957, + "grad_norm": 0.43137188826371686, + "learning_rate": 4.7642146304784475e-07, + "loss": 0.3859, + "step": 6301 + }, + { + "epoch": 4.370319001386963, + "grad_norm": 0.38940894932145365, + "learning_rate": 4.7539073402189605e-07, + "loss": 0.3519, + "step": 6302 + }, + { + "epoch": 4.371012482662968, + "grad_norm": 0.390519751931627, + "learning_rate": 4.7436106553159e-07, + "loss": 0.3633, + "step": 6303 + }, + { + "epoch": 4.3717059639389735, + "grad_norm": 0.428762268328225, + "learning_rate": 4.7333245781827463e-07, + "loss": 0.388, + "step": 6304 + }, + { + "epoch": 4.372399445214979, + "grad_norm": 0.4331875010957346, + "learning_rate": 4.7230491112304767e-07, + "loss": 0.3477, + "step": 6305 + }, + { + "epoch": 4.373092926490985, + "grad_norm": 0.39582647322625164, + "learning_rate": 4.712784256867581e-07, + "loss": 0.3278, + "step": 6306 + }, + { + "epoch": 4.37378640776699, + "grad_norm": 0.63486828029795, + "learning_rate": 4.7025300175000675e-07, + "loss": 0.3886, + "step": 6307 + }, + { + "epoch": 4.374479889042996, + "grad_norm": 0.36852223068564477, + "learning_rate": 4.69228639553147e-07, + "loss": 0.3154, + "step": 6308 + }, + { + "epoch": 4.375173370319001, + "grad_norm": 0.39740566447596537, + "learning_rate": 4.6820533933627956e-07, + "loss": 0.2965, + "step": 6309 + }, + { + "epoch": 4.375866851595007, + "grad_norm": 0.4155349731984529, + "learning_rate": 4.6718310133926084e-07, + "loss": 0.3323, + "step": 6310 + }, + { + "epoch": 4.376560332871012, + "grad_norm": 0.46088054896957437, + "learning_rate": 4.6616192580169306e-07, + "loss": 0.3774, + "step": 6311 + }, + { + "epoch": 4.377253814147018, + "grad_norm": 0.3980571868353865, + "learning_rate": 4.651418129629348e-07, + "loss": 0.3279, + "step": 6312 + }, + { + "epoch": 4.377947295423024, + "grad_norm": 0.4400787696951905, + "learning_rate": 4.6412276306209426e-07, + "loss": 0.3894, + "step": 6313 + }, + { + "epoch": 4.378640776699029, + "grad_norm": 0.4315819008936016, + "learning_rate": 4.631047763380264e-07, + "loss": 0.3232, + "step": 6314 + }, + { + "epoch": 4.379334257975034, + "grad_norm": 0.4061097626261044, + "learning_rate": 4.620878530293421e-07, + "loss": 0.3594, + "step": 6315 + }, + { + "epoch": 4.38002773925104, + "grad_norm": 0.4617086676641591, + "learning_rate": 4.61071993374399e-07, + "loss": 0.3613, + "step": 6316 + }, + { + "epoch": 4.380721220527046, + "grad_norm": 0.4837472209010971, + "learning_rate": 4.600571976113083e-07, + "loss": 0.3306, + "step": 6317 + }, + { + "epoch": 4.381414701803052, + "grad_norm": 0.4266507892564871, + "learning_rate": 4.590434659779314e-07, + "loss": 0.3364, + "step": 6318 + }, + { + "epoch": 4.3821081830790565, + "grad_norm": 0.4145526426917289, + "learning_rate": 4.5803079871187816e-07, + "loss": 0.3561, + "step": 6319 + }, + { + "epoch": 4.382801664355062, + "grad_norm": 0.585601135435787, + "learning_rate": 4.570191960505116e-07, + "loss": 0.3438, + "step": 6320 + }, + { + "epoch": 4.383495145631068, + "grad_norm": 0.5158600867218267, + "learning_rate": 4.560086582309431e-07, + "loss": 0.3316, + "step": 6321 + }, + { + "epoch": 4.384188626907074, + "grad_norm": 0.3885637940829561, + "learning_rate": 4.54999185490036e-07, + "loss": 0.3029, + "step": 6322 + }, + { + "epoch": 4.384882108183079, + "grad_norm": 0.40677268320065696, + "learning_rate": 4.5399077806440486e-07, + "loss": 0.2962, + "step": 6323 + }, + { + "epoch": 4.385575589459084, + "grad_norm": 0.4795919028533219, + "learning_rate": 4.529834361904101e-07, + "loss": 0.3882, + "step": 6324 + }, + { + "epoch": 4.38626907073509, + "grad_norm": 0.4015692080918974, + "learning_rate": 4.5197716010416723e-07, + "loss": 0.3519, + "step": 6325 + }, + { + "epoch": 4.386962552011096, + "grad_norm": 0.4019314645164274, + "learning_rate": 4.509719500415405e-07, + "loss": 0.3665, + "step": 6326 + }, + { + "epoch": 4.387656033287101, + "grad_norm": 0.4176820631419, + "learning_rate": 4.4996780623814186e-07, + "loss": 0.3473, + "step": 6327 + }, + { + "epoch": 4.388349514563107, + "grad_norm": 0.43013904897295463, + "learning_rate": 4.4896472892933693e-07, + "loss": 0.3386, + "step": 6328 + }, + { + "epoch": 4.389042995839112, + "grad_norm": 0.3700295995202613, + "learning_rate": 4.479627183502394e-07, + "loss": 0.3298, + "step": 6329 + }, + { + "epoch": 4.389736477115118, + "grad_norm": 0.3864971972567878, + "learning_rate": 4.46961774735713e-07, + "loss": 0.3558, + "step": 6330 + }, + { + "epoch": 4.390429958391123, + "grad_norm": 0.4215785042564987, + "learning_rate": 4.4596189832037286e-07, + "loss": 0.394, + "step": 6331 + }, + { + "epoch": 4.391123439667129, + "grad_norm": 0.41938505377725205, + "learning_rate": 4.449630893385809e-07, + "loss": 0.3625, + "step": 6332 + }, + { + "epoch": 4.391816920943135, + "grad_norm": 0.3759323573464562, + "learning_rate": 4.4396534802445213e-07, + "loss": 0.3392, + "step": 6333 + }, + { + "epoch": 4.39251040221914, + "grad_norm": 0.7048532444104139, + "learning_rate": 4.4296867461185e-07, + "loss": 0.3742, + "step": 6334 + }, + { + "epoch": 4.393203883495145, + "grad_norm": 0.5408104347465381, + "learning_rate": 4.41973069334386e-07, + "loss": 0.3372, + "step": 6335 + }, + { + "epoch": 4.393897364771151, + "grad_norm": 0.46993520360254293, + "learning_rate": 4.40978532425424e-07, + "loss": 0.3582, + "step": 6336 + }, + { + "epoch": 4.394590846047157, + "grad_norm": 0.4428729197009111, + "learning_rate": 4.39985064118077e-07, + "loss": 0.3481, + "step": 6337 + }, + { + "epoch": 4.3952843273231625, + "grad_norm": 0.41780682419165016, + "learning_rate": 4.3899266464520365e-07, + "loss": 0.3481, + "step": 6338 + }, + { + "epoch": 4.395977808599167, + "grad_norm": 0.43262509885222333, + "learning_rate": 4.380013342394196e-07, + "loss": 0.3639, + "step": 6339 + }, + { + "epoch": 4.396671289875173, + "grad_norm": 0.4595878985194084, + "learning_rate": 4.370110731330818e-07, + "loss": 0.3621, + "step": 6340 + }, + { + "epoch": 4.397364771151179, + "grad_norm": 0.45021919353226675, + "learning_rate": 4.360218815583023e-07, + "loss": 0.3852, + "step": 6341 + }, + { + "epoch": 4.398058252427185, + "grad_norm": 0.5048590140254069, + "learning_rate": 4.3503375974694063e-07, + "loss": 0.354, + "step": 6342 + }, + { + "epoch": 4.39875173370319, + "grad_norm": 0.41822513041651366, + "learning_rate": 4.340467079306032e-07, + "loss": 0.363, + "step": 6343 + }, + { + "epoch": 4.399445214979195, + "grad_norm": 0.43747501097360025, + "learning_rate": 4.3306072634065e-07, + "loss": 0.3561, + "step": 6344 + }, + { + "epoch": 4.400138696255201, + "grad_norm": 0.4075033166791627, + "learning_rate": 4.3207581520818773e-07, + "loss": 0.35, + "step": 6345 + }, + { + "epoch": 4.400832177531207, + "grad_norm": 0.41544014884749425, + "learning_rate": 4.310919747640707e-07, + "loss": 0.3249, + "step": 6346 + }, + { + "epoch": 4.401525658807212, + "grad_norm": 0.40260693254121643, + "learning_rate": 4.3010920523890554e-07, + "loss": 0.4032, + "step": 6347 + }, + { + "epoch": 4.402219140083218, + "grad_norm": 0.372218437013559, + "learning_rate": 4.2912750686304625e-07, + "loss": 0.3415, + "step": 6348 + }, + { + "epoch": 4.402912621359223, + "grad_norm": 0.3931426152441145, + "learning_rate": 4.28146879866595e-07, + "loss": 0.3326, + "step": 6349 + }, + { + "epoch": 4.403606102635229, + "grad_norm": 0.38945490066830396, + "learning_rate": 4.271673244794056e-07, + "loss": 0.3189, + "step": 6350 + }, + { + "epoch": 4.404299583911234, + "grad_norm": 0.3827032181801738, + "learning_rate": 4.2618884093107604e-07, + "loss": 0.3451, + "step": 6351 + }, + { + "epoch": 4.40499306518724, + "grad_norm": 0.39567281835561535, + "learning_rate": 4.252114294509574e-07, + "loss": 0.3671, + "step": 6352 + }, + { + "epoch": 4.4056865464632455, + "grad_norm": 0.4114214376501773, + "learning_rate": 4.24235090268148e-07, + "loss": 0.3467, + "step": 6353 + }, + { + "epoch": 4.406380027739251, + "grad_norm": 0.4301141111183747, + "learning_rate": 4.2325982361149377e-07, + "loss": 0.4031, + "step": 6354 + }, + { + "epoch": 4.407073509015256, + "grad_norm": 0.4556807307871837, + "learning_rate": 4.222856297095912e-07, + "loss": 0.3503, + "step": 6355 + }, + { + "epoch": 4.407766990291262, + "grad_norm": 0.3722555998423781, + "learning_rate": 4.213125087907821e-07, + "loss": 0.3239, + "step": 6356 + }, + { + "epoch": 4.408460471567268, + "grad_norm": 0.41884999361999586, + "learning_rate": 4.2034046108316127e-07, + "loss": 0.3665, + "step": 6357 + }, + { + "epoch": 4.4091539528432735, + "grad_norm": 0.42194226562584725, + "learning_rate": 4.193694868145698e-07, + "loss": 0.3146, + "step": 6358 + }, + { + "epoch": 4.409847434119278, + "grad_norm": 0.4169510229784901, + "learning_rate": 4.18399586212595e-07, + "loss": 0.3199, + "step": 6359 + }, + { + "epoch": 4.410540915395284, + "grad_norm": 0.4398902848685013, + "learning_rate": 4.174307595045768e-07, + "loss": 0.386, + "step": 6360 + }, + { + "epoch": 4.41123439667129, + "grad_norm": 0.6713588084844746, + "learning_rate": 4.16463006917599e-07, + "loss": 0.3217, + "step": 6361 + }, + { + "epoch": 4.411927877947296, + "grad_norm": 0.49245107242596364, + "learning_rate": 4.154963286784969e-07, + "loss": 0.3238, + "step": 6362 + }, + { + "epoch": 4.412621359223301, + "grad_norm": 0.41633481101546227, + "learning_rate": 4.1453072501385415e-07, + "loss": 0.3444, + "step": 6363 + }, + { + "epoch": 4.413314840499306, + "grad_norm": 0.41190021325085757, + "learning_rate": 4.135661961499987e-07, + "loss": 0.3446, + "step": 6364 + }, + { + "epoch": 4.414008321775312, + "grad_norm": 0.41880141955352584, + "learning_rate": 4.1260274231301025e-07, + "loss": 0.339, + "step": 6365 + }, + { + "epoch": 4.414701803051318, + "grad_norm": 0.4247633183013577, + "learning_rate": 4.116403637287153e-07, + "loss": 0.3349, + "step": 6366 + }, + { + "epoch": 4.415395284327323, + "grad_norm": 0.4101174101082708, + "learning_rate": 4.10679060622689e-07, + "loss": 0.3779, + "step": 6367 + }, + { + "epoch": 4.4160887656033285, + "grad_norm": 0.4493975809531821, + "learning_rate": 4.097188332202545e-07, + "loss": 0.3405, + "step": 6368 + }, + { + "epoch": 4.416782246879334, + "grad_norm": 0.6886349458825491, + "learning_rate": 4.0875968174648005e-07, + "loss": 0.3873, + "step": 6369 + }, + { + "epoch": 4.41747572815534, + "grad_norm": 0.5516427166935733, + "learning_rate": 4.078016064261847e-07, + "loss": 0.3898, + "step": 6370 + }, + { + "epoch": 4.418169209431345, + "grad_norm": 0.51104124248911, + "learning_rate": 4.068446074839355e-07, + "loss": 0.3698, + "step": 6371 + }, + { + "epoch": 4.418862690707351, + "grad_norm": 0.37054836093861715, + "learning_rate": 4.0588868514404466e-07, + "loss": 0.3271, + "step": 6372 + }, + { + "epoch": 4.4195561719833565, + "grad_norm": 0.43217439156912235, + "learning_rate": 4.0493383963057354e-07, + "loss": 0.359, + "step": 6373 + }, + { + "epoch": 4.420249653259362, + "grad_norm": 0.41123167200583505, + "learning_rate": 4.039800711673314e-07, + "loss": 0.3375, + "step": 6374 + }, + { + "epoch": 4.420943134535367, + "grad_norm": 0.4092117836816686, + "learning_rate": 4.0302737997787444e-07, + "loss": 0.3707, + "step": 6375 + }, + { + "epoch": 4.421636615811373, + "grad_norm": 0.3971762370452769, + "learning_rate": 4.020757662855079e-07, + "loss": 0.3241, + "step": 6376 + }, + { + "epoch": 4.422330097087379, + "grad_norm": 0.4402934915902368, + "learning_rate": 4.011252303132812e-07, + "loss": 0.3781, + "step": 6377 + }, + { + "epoch": 4.4230235783633844, + "grad_norm": 0.4416151158821577, + "learning_rate": 4.0017577228399383e-07, + "loss": 0.3152, + "step": 6378 + }, + { + "epoch": 4.423717059639389, + "grad_norm": 0.4063817727030474, + "learning_rate": 3.992273924201928e-07, + "loss": 0.3431, + "step": 6379 + }, + { + "epoch": 4.424410540915395, + "grad_norm": 0.4167749691745393, + "learning_rate": 3.9828009094416973e-07, + "loss": 0.4004, + "step": 6380 + }, + { + "epoch": 4.425104022191401, + "grad_norm": 0.42586220669925484, + "learning_rate": 3.973338680779659e-07, + "loss": 0.3854, + "step": 6381 + }, + { + "epoch": 4.425797503467407, + "grad_norm": 0.36676096467382063, + "learning_rate": 3.9638872404337057e-07, + "loss": 0.2959, + "step": 6382 + }, + { + "epoch": 4.4264909847434115, + "grad_norm": 0.4305992075376699, + "learning_rate": 3.954446590619154e-07, + "loss": 0.4079, + "step": 6383 + }, + { + "epoch": 4.427184466019417, + "grad_norm": 0.381184772365284, + "learning_rate": 3.945016733548862e-07, + "loss": 0.3176, + "step": 6384 + }, + { + "epoch": 4.427877947295423, + "grad_norm": 0.40309557690038145, + "learning_rate": 3.9355976714330944e-07, + "loss": 0.3663, + "step": 6385 + }, + { + "epoch": 4.428571428571429, + "grad_norm": 0.3802601095632871, + "learning_rate": 3.9261894064796136e-07, + "loss": 0.321, + "step": 6386 + }, + { + "epoch": 4.429264909847434, + "grad_norm": 0.4167808411994532, + "learning_rate": 3.916791940893666e-07, + "loss": 0.3352, + "step": 6387 + }, + { + "epoch": 4.4299583911234395, + "grad_norm": 0.3984340665888154, + "learning_rate": 3.907405276877929e-07, + "loss": 0.3503, + "step": 6388 + }, + { + "epoch": 4.430651872399445, + "grad_norm": 0.4483459886217029, + "learning_rate": 3.898029416632582e-07, + "loss": 0.331, + "step": 6389 + }, + { + "epoch": 4.431345353675451, + "grad_norm": 0.5590900519385273, + "learning_rate": 3.8886643623552545e-07, + "loss": 0.3229, + "step": 6390 + }, + { + "epoch": 4.432038834951456, + "grad_norm": 0.41040463424095275, + "learning_rate": 3.8793101162410417e-07, + "loss": 0.3417, + "step": 6391 + }, + { + "epoch": 4.432732316227462, + "grad_norm": 0.41344522315356363, + "learning_rate": 3.869966680482512e-07, + "loss": 0.3499, + "step": 6392 + }, + { + "epoch": 4.433425797503467, + "grad_norm": 0.3809315116967289, + "learning_rate": 3.8606340572697076e-07, + "loss": 0.3489, + "step": 6393 + }, + { + "epoch": 4.434119278779473, + "grad_norm": 0.40744983573471544, + "learning_rate": 3.851312248790118e-07, + "loss": 0.3704, + "step": 6394 + }, + { + "epoch": 4.434812760055478, + "grad_norm": 0.4324191308332644, + "learning_rate": 3.842001257228728e-07, + "loss": 0.3616, + "step": 6395 + }, + { + "epoch": 4.435506241331484, + "grad_norm": 0.6043370531779009, + "learning_rate": 3.8327010847679367e-07, + "loss": 0.3609, + "step": 6396 + }, + { + "epoch": 4.43619972260749, + "grad_norm": 0.42549559985897084, + "learning_rate": 3.823411733587662e-07, + "loss": 0.3429, + "step": 6397 + }, + { + "epoch": 4.436893203883495, + "grad_norm": 0.3899977164637308, + "learning_rate": 3.8141332058652447e-07, + "loss": 0.3439, + "step": 6398 + }, + { + "epoch": 4.4375866851595, + "grad_norm": 0.39979565484471935, + "learning_rate": 3.8048655037755066e-07, + "loss": 0.3417, + "step": 6399 + }, + { + "epoch": 4.438280166435506, + "grad_norm": 0.4039727325465911, + "learning_rate": 3.795608629490738e-07, + "loss": 0.3725, + "step": 6400 + }, + { + "epoch": 4.438973647711512, + "grad_norm": 0.3902090034425511, + "learning_rate": 3.786362585180675e-07, + "loss": 0.3321, + "step": 6401 + }, + { + "epoch": 4.439667128987518, + "grad_norm": 0.4355666390335971, + "learning_rate": 3.777127373012529e-07, + "loss": 0.322, + "step": 6402 + }, + { + "epoch": 4.440360610263523, + "grad_norm": 0.4304007853870166, + "learning_rate": 3.7679029951509736e-07, + "loss": 0.3883, + "step": 6403 + }, + { + "epoch": 4.441054091539528, + "grad_norm": 0.40025749412819384, + "learning_rate": 3.7586894537581187e-07, + "loss": 0.3787, + "step": 6404 + }, + { + "epoch": 4.441747572815534, + "grad_norm": 0.3838135413625623, + "learning_rate": 3.749486750993564e-07, + "loss": 0.336, + "step": 6405 + }, + { + "epoch": 4.44244105409154, + "grad_norm": 0.4402419987263163, + "learning_rate": 3.740294889014351e-07, + "loss": 0.3689, + "step": 6406 + }, + { + "epoch": 4.443134535367545, + "grad_norm": 0.4712223960422006, + "learning_rate": 3.731113869974984e-07, + "loss": 0.3576, + "step": 6407 + }, + { + "epoch": 4.44382801664355, + "grad_norm": 0.37961848140922916, + "learning_rate": 3.721943696027441e-07, + "loss": 0.343, + "step": 6408 + }, + { + "epoch": 4.444521497919556, + "grad_norm": 0.4261017222835085, + "learning_rate": 3.712784369321121e-07, + "loss": 0.3271, + "step": 6409 + }, + { + "epoch": 4.445214979195562, + "grad_norm": 0.3955025959090337, + "learning_rate": 3.703635892002927e-07, + "loss": 0.3629, + "step": 6410 + }, + { + "epoch": 4.445908460471568, + "grad_norm": 0.4303594210927355, + "learning_rate": 3.694498266217178e-07, + "loss": 0.3559, + "step": 6411 + }, + { + "epoch": 4.446601941747573, + "grad_norm": 0.3922912554410651, + "learning_rate": 3.685371494105683e-07, + "loss": 0.3514, + "step": 6412 + }, + { + "epoch": 4.447295423023578, + "grad_norm": 0.39983174854623693, + "learning_rate": 3.676255577807686e-07, + "loss": 0.3511, + "step": 6413 + }, + { + "epoch": 4.447988904299584, + "grad_norm": 0.4365924653744624, + "learning_rate": 3.6671505194598777e-07, + "loss": 0.3367, + "step": 6414 + }, + { + "epoch": 4.448682385575589, + "grad_norm": 0.4256680423167934, + "learning_rate": 3.6580563211964346e-07, + "loss": 0.3551, + "step": 6415 + }, + { + "epoch": 4.449375866851595, + "grad_norm": 0.41180963080396754, + "learning_rate": 3.648972985148974e-07, + "loss": 0.3616, + "step": 6416 + }, + { + "epoch": 4.450069348127601, + "grad_norm": 0.40246046167541016, + "learning_rate": 3.6399005134465426e-07, + "loss": 0.3099, + "step": 6417 + }, + { + "epoch": 4.450762829403606, + "grad_norm": 0.39309268381047585, + "learning_rate": 3.6308389082156835e-07, + "loss": 0.3318, + "step": 6418 + }, + { + "epoch": 4.451456310679612, + "grad_norm": 1.3650711117186602, + "learning_rate": 3.6217881715803536e-07, + "loss": 0.3614, + "step": 6419 + }, + { + "epoch": 4.452149791955617, + "grad_norm": 0.3953208802560284, + "learning_rate": 3.612748305661995e-07, + "loss": 0.3515, + "step": 6420 + }, + { + "epoch": 4.452843273231623, + "grad_norm": 0.4014098797996639, + "learning_rate": 3.60371931257949e-07, + "loss": 0.3498, + "step": 6421 + }, + { + "epoch": 4.4535367545076285, + "grad_norm": 0.43947384328906647, + "learning_rate": 3.5947011944491516e-07, + "loss": 0.3052, + "step": 6422 + }, + { + "epoch": 4.454230235783633, + "grad_norm": 0.5383784922701661, + "learning_rate": 3.585693953384767e-07, + "loss": 0.3467, + "step": 6423 + }, + { + "epoch": 4.454923717059639, + "grad_norm": 0.3988991932539132, + "learning_rate": 3.576697591497585e-07, + "loss": 0.3618, + "step": 6424 + }, + { + "epoch": 4.455617198335645, + "grad_norm": 0.3606107553496361, + "learning_rate": 3.5677121108962655e-07, + "loss": 0.2923, + "step": 6425 + }, + { + "epoch": 4.456310679611651, + "grad_norm": 0.4249631383075413, + "learning_rate": 3.558737513686944e-07, + "loss": 0.3456, + "step": 6426 + }, + { + "epoch": 4.4570041608876565, + "grad_norm": 0.429837817751984, + "learning_rate": 3.549773801973211e-07, + "loss": 0.3941, + "step": 6427 + }, + { + "epoch": 4.457697642163661, + "grad_norm": 0.48755067157371196, + "learning_rate": 3.5408209778560854e-07, + "loss": 0.3559, + "step": 6428 + }, + { + "epoch": 4.458391123439667, + "grad_norm": 0.37875143764677704, + "learning_rate": 3.5318790434340613e-07, + "loss": 0.3185, + "step": 6429 + }, + { + "epoch": 4.459084604715673, + "grad_norm": 0.41411150489536086, + "learning_rate": 3.5229480008030395e-07, + "loss": 0.3629, + "step": 6430 + }, + { + "epoch": 4.459778085991678, + "grad_norm": 0.4039513018740434, + "learning_rate": 3.514027852056406e-07, + "loss": 0.3518, + "step": 6431 + }, + { + "epoch": 4.460471567267684, + "grad_norm": 0.4165462965731385, + "learning_rate": 3.50511859928499e-07, + "loss": 0.3861, + "step": 6432 + }, + { + "epoch": 4.461165048543689, + "grad_norm": 0.3965463802401738, + "learning_rate": 3.496220244577025e-07, + "loss": 0.3054, + "step": 6433 + }, + { + "epoch": 4.461858529819695, + "grad_norm": 0.4688123280454221, + "learning_rate": 3.487332790018244e-07, + "loss": 0.357, + "step": 6434 + }, + { + "epoch": 4.462552011095701, + "grad_norm": 0.40119468223791616, + "learning_rate": 3.4784562376918076e-07, + "loss": 0.3409, + "step": 6435 + }, + { + "epoch": 4.463245492371706, + "grad_norm": 0.46370427307719886, + "learning_rate": 3.469590589678284e-07, + "loss": 0.3669, + "step": 6436 + }, + { + "epoch": 4.4639389736477115, + "grad_norm": 0.4033241157687948, + "learning_rate": 3.460735848055752e-07, + "loss": 0.3143, + "step": 6437 + }, + { + "epoch": 4.464632454923717, + "grad_norm": 0.40705754510496184, + "learning_rate": 3.451892014899677e-07, + "loss": 0.3403, + "step": 6438 + }, + { + "epoch": 4.465325936199722, + "grad_norm": 0.41357876947975447, + "learning_rate": 3.4430590922829965e-07, + "loss": 0.3331, + "step": 6439 + }, + { + "epoch": 4.466019417475728, + "grad_norm": 0.3954692268157122, + "learning_rate": 3.434237082276093e-07, + "loss": 0.3614, + "step": 6440 + }, + { + "epoch": 4.466712898751734, + "grad_norm": 0.3760812585813724, + "learning_rate": 3.4254259869467623e-07, + "loss": 0.3235, + "step": 6441 + }, + { + "epoch": 4.4674063800277395, + "grad_norm": 0.40449653513926387, + "learning_rate": 3.4166258083602797e-07, + "loss": 0.3849, + "step": 6442 + }, + { + "epoch": 4.468099861303745, + "grad_norm": 0.40914052503140225, + "learning_rate": 3.4078365485793297e-07, + "loss": 0.3773, + "step": 6443 + }, + { + "epoch": 4.46879334257975, + "grad_norm": 0.4419044690559538, + "learning_rate": 3.3990582096640526e-07, + "loss": 0.3464, + "step": 6444 + }, + { + "epoch": 4.469486823855756, + "grad_norm": 0.4085625947232388, + "learning_rate": 3.3902907936720353e-07, + "loss": 0.3663, + "step": 6445 + }, + { + "epoch": 4.470180305131762, + "grad_norm": 0.3936127333601739, + "learning_rate": 3.38153430265829e-07, + "loss": 0.3406, + "step": 6446 + }, + { + "epoch": 4.470873786407767, + "grad_norm": 0.40031214701874424, + "learning_rate": 3.3727887386752866e-07, + "loss": 0.3489, + "step": 6447 + }, + { + "epoch": 4.471567267683772, + "grad_norm": 0.4553388947830274, + "learning_rate": 3.3640541037729013e-07, + "loss": 0.3509, + "step": 6448 + }, + { + "epoch": 4.472260748959778, + "grad_norm": 0.44325214353034015, + "learning_rate": 3.3553303999984854e-07, + "loss": 0.3548, + "step": 6449 + }, + { + "epoch": 4.472954230235784, + "grad_norm": 0.4393914871143648, + "learning_rate": 3.3466176293968146e-07, + "loss": 0.3728, + "step": 6450 + }, + { + "epoch": 4.47364771151179, + "grad_norm": 0.4603045183143016, + "learning_rate": 3.3379157940100825e-07, + "loss": 0.3774, + "step": 6451 + }, + { + "epoch": 4.4743411927877945, + "grad_norm": 0.5124282463772938, + "learning_rate": 3.3292248958779414e-07, + "loss": 0.3278, + "step": 6452 + }, + { + "epoch": 4.4750346740638, + "grad_norm": 0.4107030441536405, + "learning_rate": 3.3205449370374955e-07, + "loss": 0.3506, + "step": 6453 + }, + { + "epoch": 4.475728155339806, + "grad_norm": 0.3894232902370426, + "learning_rate": 3.3118759195232273e-07, + "loss": 0.3618, + "step": 6454 + }, + { + "epoch": 4.476421636615811, + "grad_norm": 0.43683930384381864, + "learning_rate": 3.303217845367124e-07, + "loss": 0.3219, + "step": 6455 + }, + { + "epoch": 4.477115117891817, + "grad_norm": 0.39801986042785004, + "learning_rate": 3.294570716598561e-07, + "loss": 0.3405, + "step": 6456 + }, + { + "epoch": 4.4778085991678225, + "grad_norm": 0.3746218809626706, + "learning_rate": 3.2859345352443673e-07, + "loss": 0.3257, + "step": 6457 + }, + { + "epoch": 4.478502080443828, + "grad_norm": 0.4054198427002251, + "learning_rate": 3.2773093033288016e-07, + "loss": 0.3844, + "step": 6458 + }, + { + "epoch": 4.479195561719834, + "grad_norm": 0.4059728409327418, + "learning_rate": 3.2686950228735525e-07, + "loss": 0.3304, + "step": 6459 + }, + { + "epoch": 4.479889042995839, + "grad_norm": 0.4251068297741258, + "learning_rate": 3.2600916958977437e-07, + "loss": 0.3519, + "step": 6460 + }, + { + "epoch": 4.480582524271845, + "grad_norm": 0.4240131950128313, + "learning_rate": 3.2514993244179395e-07, + "loss": 0.3275, + "step": 6461 + }, + { + "epoch": 4.48127600554785, + "grad_norm": 0.6330401176140308, + "learning_rate": 3.242917910448118e-07, + "loss": 0.3408, + "step": 6462 + }, + { + "epoch": 4.481969486823855, + "grad_norm": 0.3881132677643724, + "learning_rate": 3.234347455999709e-07, + "loss": 0.3624, + "step": 6463 + }, + { + "epoch": 4.482662968099861, + "grad_norm": 0.36505614923981594, + "learning_rate": 3.2257879630815614e-07, + "loss": 0.3254, + "step": 6464 + }, + { + "epoch": 4.483356449375867, + "grad_norm": 0.5751774300457781, + "learning_rate": 3.2172394336999644e-07, + "loss": 0.3533, + "step": 6465 + }, + { + "epoch": 4.484049930651873, + "grad_norm": 1.4789054122146275, + "learning_rate": 3.2087018698586326e-07, + "loss": 0.3409, + "step": 6466 + }, + { + "epoch": 4.484743411927878, + "grad_norm": 0.42218696405004064, + "learning_rate": 3.200175273558698e-07, + "loss": 0.3794, + "step": 6467 + }, + { + "epoch": 4.485436893203883, + "grad_norm": 0.44298189214979666, + "learning_rate": 3.1916596467987395e-07, + "loss": 0.3632, + "step": 6468 + }, + { + "epoch": 4.486130374479889, + "grad_norm": 0.42006300798758667, + "learning_rate": 3.183154991574766e-07, + "loss": 0.3413, + "step": 6469 + }, + { + "epoch": 4.486823855755895, + "grad_norm": 0.38498342351820186, + "learning_rate": 3.174661309880189e-07, + "loss": 0.3429, + "step": 6470 + }, + { + "epoch": 4.4875173370319, + "grad_norm": 0.4217689623941785, + "learning_rate": 3.166178603705872e-07, + "loss": 0.3565, + "step": 6471 + }, + { + "epoch": 4.4882108183079055, + "grad_norm": 0.3774929025318759, + "learning_rate": 3.157706875040112e-07, + "loss": 0.3283, + "step": 6472 + }, + { + "epoch": 4.488904299583911, + "grad_norm": 0.532292283351115, + "learning_rate": 3.1492461258686044e-07, + "loss": 0.3479, + "step": 6473 + }, + { + "epoch": 4.489597780859917, + "grad_norm": 0.3983099552400063, + "learning_rate": 3.140796358174508e-07, + "loss": 0.3574, + "step": 6474 + }, + { + "epoch": 4.490291262135923, + "grad_norm": 0.408053838958733, + "learning_rate": 3.1323575739383716e-07, + "loss": 0.3367, + "step": 6475 + }, + { + "epoch": 4.490984743411928, + "grad_norm": 0.3979044170543787, + "learning_rate": 3.1239297751381845e-07, + "loss": 0.3448, + "step": 6476 + }, + { + "epoch": 4.491678224687933, + "grad_norm": 0.3911801713811084, + "learning_rate": 3.1155129637493733e-07, + "loss": 0.3465, + "step": 6477 + }, + { + "epoch": 4.492371705963939, + "grad_norm": 0.40382648272801225, + "learning_rate": 3.1071071417447587e-07, + "loss": 0.3447, + "step": 6478 + }, + { + "epoch": 4.493065187239944, + "grad_norm": 0.3910398820348786, + "learning_rate": 3.0987123110946204e-07, + "loss": 0.3576, + "step": 6479 + }, + { + "epoch": 4.49375866851595, + "grad_norm": 0.4587393510940895, + "learning_rate": 3.090328473766646e-07, + "loss": 0.3157, + "step": 6480 + }, + { + "epoch": 4.494452149791956, + "grad_norm": 0.43309360839548056, + "learning_rate": 3.0819556317259304e-07, + "loss": 0.3673, + "step": 6481 + }, + { + "epoch": 4.495145631067961, + "grad_norm": 0.4333632180477413, + "learning_rate": 3.073593786935031e-07, + "loss": 0.3578, + "step": 6482 + }, + { + "epoch": 4.495839112343967, + "grad_norm": 0.4287510088150104, + "learning_rate": 3.06524294135388e-07, + "loss": 0.3971, + "step": 6483 + }, + { + "epoch": 4.496532593619972, + "grad_norm": 0.428508304993015, + "learning_rate": 3.0569030969398726e-07, + "loss": 0.3538, + "step": 6484 + }, + { + "epoch": 4.497226074895978, + "grad_norm": 0.4713320414272633, + "learning_rate": 3.0485742556478073e-07, + "loss": 0.3815, + "step": 6485 + }, + { + "epoch": 4.497919556171984, + "grad_norm": 0.43818288531586524, + "learning_rate": 3.040256419429888e-07, + "loss": 0.3896, + "step": 6486 + }, + { + "epoch": 4.4986130374479885, + "grad_norm": 0.4094588046686549, + "learning_rate": 3.031949590235772e-07, + "loss": 0.3575, + "step": 6487 + }, + { + "epoch": 4.499306518723994, + "grad_norm": 0.4537010217842934, + "learning_rate": 3.023653770012508e-07, + "loss": 0.3778, + "step": 6488 + }, + { + "epoch": 4.5, + "grad_norm": 0.41914491753733574, + "learning_rate": 3.015368960704584e-07, + "loss": 0.3517, + "step": 6489 + }, + { + "epoch": 4.500693481276006, + "grad_norm": 0.4030602571777503, + "learning_rate": 3.0070951642538925e-07, + "loss": 0.3767, + "step": 6490 + }, + { + "epoch": 4.5013869625520115, + "grad_norm": 0.4218914131164033, + "learning_rate": 2.998832382599759e-07, + "loss": 0.3659, + "step": 6491 + }, + { + "epoch": 4.502080443828016, + "grad_norm": 0.40665886625530623, + "learning_rate": 2.990580617678923e-07, + "loss": 0.3709, + "step": 6492 + }, + { + "epoch": 4.502773925104022, + "grad_norm": 0.4221749105124615, + "learning_rate": 2.982339871425527e-07, + "loss": 0.3401, + "step": 6493 + }, + { + "epoch": 4.503467406380028, + "grad_norm": 0.42294670887561653, + "learning_rate": 2.974110145771142e-07, + "loss": 0.4071, + "step": 6494 + }, + { + "epoch": 4.504160887656033, + "grad_norm": 0.521710788793327, + "learning_rate": 2.965891442644775e-07, + "loss": 0.3478, + "step": 6495 + }, + { + "epoch": 4.504854368932039, + "grad_norm": 0.3807171060977504, + "learning_rate": 2.9576837639728073e-07, + "loss": 0.3382, + "step": 6496 + }, + { + "epoch": 4.505547850208044, + "grad_norm": 0.39005771180130033, + "learning_rate": 2.9494871116790667e-07, + "loss": 0.3724, + "step": 6497 + }, + { + "epoch": 4.50624133148405, + "grad_norm": 0.39533793319339844, + "learning_rate": 2.9413014876848e-07, + "loss": 0.3297, + "step": 6498 + }, + { + "epoch": 4.506934812760056, + "grad_norm": 0.4473857911757184, + "learning_rate": 2.9331268939086334e-07, + "loss": 0.3262, + "step": 6499 + }, + { + "epoch": 4.507628294036061, + "grad_norm": 0.3871276758241381, + "learning_rate": 2.924963332266667e-07, + "loss": 0.3793, + "step": 6500 + }, + { + "epoch": 4.508321775312067, + "grad_norm": 0.4422729774052628, + "learning_rate": 2.916810804672349e-07, + "loss": 0.3718, + "step": 6501 + }, + { + "epoch": 4.509015256588072, + "grad_norm": 0.39973229638445923, + "learning_rate": 2.908669313036588e-07, + "loss": 0.4085, + "step": 6502 + }, + { + "epoch": 4.509708737864077, + "grad_norm": 0.4118083586733117, + "learning_rate": 2.9005388592676987e-07, + "loss": 0.3277, + "step": 6503 + }, + { + "epoch": 4.510402219140083, + "grad_norm": 0.4167204896286219, + "learning_rate": 2.892419445271383e-07, + "loss": 0.3163, + "step": 6504 + }, + { + "epoch": 4.511095700416089, + "grad_norm": 0.4135384801167932, + "learning_rate": 2.8843110729507794e-07, + "loss": 0.3837, + "step": 6505 + }, + { + "epoch": 4.5117891816920945, + "grad_norm": 0.3758678191361516, + "learning_rate": 2.8762137442064353e-07, + "loss": 0.32, + "step": 6506 + }, + { + "epoch": 4.5124826629681, + "grad_norm": 0.4284753020111622, + "learning_rate": 2.868127460936304e-07, + "loss": 0.3666, + "step": 6507 + }, + { + "epoch": 4.513176144244105, + "grad_norm": 0.3606113740843206, + "learning_rate": 2.860052225035742e-07, + "loss": 0.2953, + "step": 6508 + }, + { + "epoch": 4.513869625520111, + "grad_norm": 0.4469565264864672, + "learning_rate": 2.8519880383975406e-07, + "loss": 0.3571, + "step": 6509 + }, + { + "epoch": 4.514563106796117, + "grad_norm": 0.3668758159511137, + "learning_rate": 2.8439349029118825e-07, + "loss": 0.3213, + "step": 6510 + }, + { + "epoch": 4.515256588072122, + "grad_norm": 0.4240413519433988, + "learning_rate": 2.8358928204663684e-07, + "loss": 0.3593, + "step": 6511 + }, + { + "epoch": 4.515950069348127, + "grad_norm": 0.44069596706920966, + "learning_rate": 2.827861792945991e-07, + "loss": 0.385, + "step": 6512 + }, + { + "epoch": 4.516643550624133, + "grad_norm": 0.7423024391069702, + "learning_rate": 2.8198418222331713e-07, + "loss": 0.3289, + "step": 6513 + }, + { + "epoch": 4.517337031900139, + "grad_norm": 0.41623711687393744, + "learning_rate": 2.81183291020774e-07, + "loss": 0.3513, + "step": 6514 + }, + { + "epoch": 4.518030513176145, + "grad_norm": 0.4517311584869714, + "learning_rate": 2.803835058746918e-07, + "loss": 0.3575, + "step": 6515 + }, + { + "epoch": 4.51872399445215, + "grad_norm": 0.39809335931351525, + "learning_rate": 2.7958482697253433e-07, + "loss": 0.3522, + "step": 6516 + }, + { + "epoch": 4.519417475728155, + "grad_norm": 0.4291542228527471, + "learning_rate": 2.787872545015069e-07, + "loss": 0.3589, + "step": 6517 + }, + { + "epoch": 4.520110957004161, + "grad_norm": 0.4150943789778223, + "learning_rate": 2.7799078864855446e-07, + "loss": 0.3654, + "step": 6518 + }, + { + "epoch": 4.520804438280166, + "grad_norm": 0.44335141616014906, + "learning_rate": 2.7719542960036315e-07, + "loss": 0.3644, + "step": 6519 + }, + { + "epoch": 4.521497919556172, + "grad_norm": 0.4348871010347055, + "learning_rate": 2.764011775433584e-07, + "loss": 0.3655, + "step": 6520 + }, + { + "epoch": 4.5221914008321775, + "grad_norm": 0.41287644806382484, + "learning_rate": 2.7560803266370783e-07, + "loss": 0.3192, + "step": 6521 + }, + { + "epoch": 4.522884882108183, + "grad_norm": 0.39974817155933356, + "learning_rate": 2.748159951473195e-07, + "loss": 0.3597, + "step": 6522 + }, + { + "epoch": 4.523578363384189, + "grad_norm": 0.4541361314750261, + "learning_rate": 2.7402506517983983e-07, + "loss": 0.3593, + "step": 6523 + }, + { + "epoch": 4.524271844660194, + "grad_norm": 0.40599335654716, + "learning_rate": 2.732352429466573e-07, + "loss": 0.3649, + "step": 6524 + }, + { + "epoch": 4.5249653259362, + "grad_norm": 0.6400772691369392, + "learning_rate": 2.72446528632902e-07, + "loss": 0.4258, + "step": 6525 + }, + { + "epoch": 4.5256588072122055, + "grad_norm": 0.38880545523469967, + "learning_rate": 2.716589224234406e-07, + "loss": 0.3443, + "step": 6526 + }, + { + "epoch": 4.52635228848821, + "grad_norm": 0.40296157361064605, + "learning_rate": 2.708724245028849e-07, + "loss": 0.3193, + "step": 6527 + }, + { + "epoch": 4.527045769764216, + "grad_norm": 0.6351020494405178, + "learning_rate": 2.700870350555823e-07, + "loss": 0.3589, + "step": 6528 + }, + { + "epoch": 4.527739251040222, + "grad_norm": 0.3661771491123025, + "learning_rate": 2.693027542656229e-07, + "loss": 0.3158, + "step": 6529 + }, + { + "epoch": 4.528432732316228, + "grad_norm": 0.4313492272871311, + "learning_rate": 2.6851958231683685e-07, + "loss": 0.3442, + "step": 6530 + }, + { + "epoch": 4.529126213592233, + "grad_norm": 0.43572220403651096, + "learning_rate": 2.677375193927939e-07, + "loss": 0.3544, + "step": 6531 + }, + { + "epoch": 4.529819694868238, + "grad_norm": 0.42791169698498455, + "learning_rate": 2.669565656768036e-07, + "loss": 0.3386, + "step": 6532 + }, + { + "epoch": 4.530513176144244, + "grad_norm": 0.38519609286662515, + "learning_rate": 2.6617672135191565e-07, + "loss": 0.3683, + "step": 6533 + }, + { + "epoch": 4.53120665742025, + "grad_norm": 0.4280170359083549, + "learning_rate": 2.653979866009204e-07, + "loss": 0.3529, + "step": 6534 + }, + { + "epoch": 4.531900138696255, + "grad_norm": 0.4068672641372711, + "learning_rate": 2.646203616063475e-07, + "loss": 0.3641, + "step": 6535 + }, + { + "epoch": 4.5325936199722605, + "grad_norm": 0.4271099957472437, + "learning_rate": 2.638438465504667e-07, + "loss": 0.3437, + "step": 6536 + }, + { + "epoch": 4.533287101248266, + "grad_norm": 0.4737638227447026, + "learning_rate": 2.630684416152879e-07, + "loss": 0.3624, + "step": 6537 + }, + { + "epoch": 4.533980582524272, + "grad_norm": 0.39051439935558413, + "learning_rate": 2.6229414698255907e-07, + "loss": 0.3164, + "step": 6538 + }, + { + "epoch": 4.534674063800278, + "grad_norm": 0.4568092945082574, + "learning_rate": 2.6152096283377e-07, + "loss": 0.3664, + "step": 6539 + }, + { + "epoch": 4.535367545076283, + "grad_norm": 0.3843181895475802, + "learning_rate": 2.6074888935015087e-07, + "loss": 0.3594, + "step": 6540 + }, + { + "epoch": 4.5360610263522885, + "grad_norm": 0.5258146706913907, + "learning_rate": 2.5997792671266787e-07, + "loss": 0.2818, + "step": 6541 + }, + { + "epoch": 4.536754507628294, + "grad_norm": 0.45660257694532436, + "learning_rate": 2.5920807510202984e-07, + "loss": 0.3425, + "step": 6542 + }, + { + "epoch": 4.537447988904299, + "grad_norm": 0.37549771371111773, + "learning_rate": 2.584393346986852e-07, + "loss": 0.3725, + "step": 6543 + }, + { + "epoch": 4.538141470180305, + "grad_norm": 0.4330040274361384, + "learning_rate": 2.576717056828193e-07, + "loss": 0.3778, + "step": 6544 + }, + { + "epoch": 4.538834951456311, + "grad_norm": 0.44130154909130553, + "learning_rate": 2.569051882343615e-07, + "loss": 0.3657, + "step": 6545 + }, + { + "epoch": 4.539528432732316, + "grad_norm": 0.39400618194672443, + "learning_rate": 2.5613978253297533e-07, + "loss": 0.3584, + "step": 6546 + }, + { + "epoch": 4.540221914008322, + "grad_norm": 0.4069553441613309, + "learning_rate": 2.5537548875806785e-07, + "loss": 0.3495, + "step": 6547 + }, + { + "epoch": 4.540915395284327, + "grad_norm": 0.424531468262278, + "learning_rate": 2.546123070887846e-07, + "loss": 0.3497, + "step": 6548 + }, + { + "epoch": 4.541608876560333, + "grad_norm": 0.4577295112311656, + "learning_rate": 2.5385023770400754e-07, + "loss": 0.4103, + "step": 6549 + }, + { + "epoch": 4.542302357836339, + "grad_norm": 0.4027887701619919, + "learning_rate": 2.5308928078236207e-07, + "loss": 0.3572, + "step": 6550 + }, + { + "epoch": 4.5429958391123435, + "grad_norm": 0.4721085563830496, + "learning_rate": 2.5232943650221055e-07, + "loss": 0.3633, + "step": 6551 + }, + { + "epoch": 4.543689320388349, + "grad_norm": 0.4459911633946896, + "learning_rate": 2.5157070504165495e-07, + "loss": 0.3305, + "step": 6552 + }, + { + "epoch": 4.544382801664355, + "grad_norm": 0.4302313580487091, + "learning_rate": 2.5081308657853576e-07, + "loss": 0.3637, + "step": 6553 + }, + { + "epoch": 4.545076282940361, + "grad_norm": 0.42841197764228633, + "learning_rate": 2.5005658129043377e-07, + "loss": 0.3545, + "step": 6554 + }, + { + "epoch": 4.545769764216367, + "grad_norm": 0.4160618786378027, + "learning_rate": 2.4930118935466875e-07, + "loss": 0.4077, + "step": 6555 + }, + { + "epoch": 4.5464632454923715, + "grad_norm": 0.40678581500629307, + "learning_rate": 2.4854691094829965e-07, + "loss": 0.3782, + "step": 6556 + }, + { + "epoch": 4.547156726768377, + "grad_norm": 0.6031029795388966, + "learning_rate": 2.477937462481217e-07, + "loss": 0.3398, + "step": 6557 + }, + { + "epoch": 4.547850208044383, + "grad_norm": 0.3924631494717983, + "learning_rate": 2.4704169543067314e-07, + "loss": 0.3172, + "step": 6558 + }, + { + "epoch": 4.548543689320388, + "grad_norm": 0.40269393212454174, + "learning_rate": 2.462907586722285e-07, + "loss": 0.3955, + "step": 6559 + }, + { + "epoch": 4.549237170596394, + "grad_norm": 0.4050210715802751, + "learning_rate": 2.4554093614880206e-07, + "loss": 0.3562, + "step": 6560 + }, + { + "epoch": 4.549930651872399, + "grad_norm": 0.4177639963500074, + "learning_rate": 2.4479222803614644e-07, + "loss": 0.3919, + "step": 6561 + }, + { + "epoch": 4.550624133148405, + "grad_norm": 0.4749450806249847, + "learning_rate": 2.4404463450975415e-07, + "loss": 0.325, + "step": 6562 + }, + { + "epoch": 4.551317614424411, + "grad_norm": 0.4351040183644717, + "learning_rate": 2.4329815574485493e-07, + "loss": 0.343, + "step": 6563 + }, + { + "epoch": 4.552011095700416, + "grad_norm": 0.3733244201879771, + "learning_rate": 2.425527919164195e-07, + "loss": 0.3062, + "step": 6564 + }, + { + "epoch": 4.552704576976422, + "grad_norm": 0.4396580819743459, + "learning_rate": 2.4180854319915346e-07, + "loss": 0.368, + "step": 6565 + }, + { + "epoch": 4.553398058252427, + "grad_norm": 0.4380716762686249, + "learning_rate": 2.410654097675041e-07, + "loss": 0.3859, + "step": 6566 + }, + { + "epoch": 4.554091539528432, + "grad_norm": 0.3668319890075246, + "learning_rate": 2.403233917956582e-07, + "loss": 0.3086, + "step": 6567 + }, + { + "epoch": 4.554785020804438, + "grad_norm": 0.3909884804602913, + "learning_rate": 2.3958248945753714e-07, + "loss": 0.3182, + "step": 6568 + }, + { + "epoch": 4.555478502080444, + "grad_norm": 0.3938518713301587, + "learning_rate": 2.3884270292680476e-07, + "loss": 0.3638, + "step": 6569 + }, + { + "epoch": 4.55617198335645, + "grad_norm": 0.3944873374706976, + "learning_rate": 2.381040323768602e-07, + "loss": 0.3288, + "step": 6570 + }, + { + "epoch": 4.556865464632455, + "grad_norm": 0.45370858756261456, + "learning_rate": 2.3736647798084268e-07, + "loss": 0.3611, + "step": 6571 + }, + { + "epoch": 4.55755894590846, + "grad_norm": 0.4639467918963087, + "learning_rate": 2.3663003991163113e-07, + "loss": 0.3637, + "step": 6572 + }, + { + "epoch": 4.558252427184466, + "grad_norm": 0.42693584690979414, + "learning_rate": 2.3589471834183975e-07, + "loss": 0.3728, + "step": 6573 + }, + { + "epoch": 4.558945908460472, + "grad_norm": 0.4206348896724024, + "learning_rate": 2.3516051344382285e-07, + "loss": 0.3571, + "step": 6574 + }, + { + "epoch": 4.559639389736477, + "grad_norm": 0.4161159497288621, + "learning_rate": 2.344274253896739e-07, + "loss": 0.358, + "step": 6575 + }, + { + "epoch": 4.560332871012482, + "grad_norm": 0.41348718852471833, + "learning_rate": 2.336954543512221e-07, + "loss": 0.3435, + "step": 6576 + }, + { + "epoch": 4.561026352288488, + "grad_norm": 0.4024688574700567, + "learning_rate": 2.3296460050003687e-07, + "loss": 0.3423, + "step": 6577 + }, + { + "epoch": 4.561719833564494, + "grad_norm": 0.43723687127994926, + "learning_rate": 2.3223486400742456e-07, + "loss": 0.3643, + "step": 6578 + }, + { + "epoch": 4.5624133148405, + "grad_norm": 0.4480853856166621, + "learning_rate": 2.3150624504442997e-07, + "loss": 0.3314, + "step": 6579 + }, + { + "epoch": 4.563106796116505, + "grad_norm": 0.37850608175012385, + "learning_rate": 2.307787437818365e-07, + "loss": 0.3154, + "step": 6580 + }, + { + "epoch": 4.56380027739251, + "grad_norm": 0.41017433062649455, + "learning_rate": 2.3005236039016554e-07, + "loss": 0.333, + "step": 6581 + }, + { + "epoch": 4.564493758668516, + "grad_norm": 0.44818421710488887, + "learning_rate": 2.2932709503967587e-07, + "loss": 0.3372, + "step": 6582 + }, + { + "epoch": 4.565187239944521, + "grad_norm": 0.4241577498244145, + "learning_rate": 2.286029479003643e-07, + "loss": 0.3616, + "step": 6583 + }, + { + "epoch": 4.565880721220527, + "grad_norm": 0.46869856563719514, + "learning_rate": 2.2787991914196505e-07, + "loss": 0.359, + "step": 6584 + }, + { + "epoch": 4.566574202496533, + "grad_norm": 0.3967347604826857, + "learning_rate": 2.2715800893395256e-07, + "loss": 0.3398, + "step": 6585 + }, + { + "epoch": 4.567267683772538, + "grad_norm": 0.42530800727361445, + "learning_rate": 2.2643721744553483e-07, + "loss": 0.3376, + "step": 6586 + }, + { + "epoch": 4.567961165048544, + "grad_norm": 0.41069185069357356, + "learning_rate": 2.257175448456622e-07, + "loss": 0.3497, + "step": 6587 + }, + { + "epoch": 4.568654646324549, + "grad_norm": 0.4001560346445005, + "learning_rate": 2.2499899130301983e-07, + "loss": 0.3004, + "step": 6588 + }, + { + "epoch": 4.569348127600555, + "grad_norm": 0.43127553399784263, + "learning_rate": 2.2428155698603182e-07, + "loss": 0.3394, + "step": 6589 + }, + { + "epoch": 4.5700416088765605, + "grad_norm": 0.44513476052202566, + "learning_rate": 2.2356524206286033e-07, + "loss": 0.344, + "step": 6590 + }, + { + "epoch": 4.570735090152565, + "grad_norm": 0.5265858974041646, + "learning_rate": 2.2285004670140275e-07, + "loss": 0.3996, + "step": 6591 + }, + { + "epoch": 4.571428571428571, + "grad_norm": 0.41331666692849756, + "learning_rate": 2.2213597106929608e-07, + "loss": 0.3445, + "step": 6592 + }, + { + "epoch": 4.572122052704577, + "grad_norm": 0.38689482305681333, + "learning_rate": 2.2142301533391586e-07, + "loss": 0.3499, + "step": 6593 + }, + { + "epoch": 4.572815533980583, + "grad_norm": 0.4151646109190327, + "learning_rate": 2.207111796623723e-07, + "loss": 0.346, + "step": 6594 + }, + { + "epoch": 4.5735090152565885, + "grad_norm": 0.38944932861630477, + "learning_rate": 2.2000046422151479e-07, + "loss": 0.3177, + "step": 6595 + }, + { + "epoch": 4.574202496532593, + "grad_norm": 0.3929990605150274, + "learning_rate": 2.1929086917793052e-07, + "loss": 0.3298, + "step": 6596 + }, + { + "epoch": 4.574895977808599, + "grad_norm": 0.382487539641259, + "learning_rate": 2.1858239469794206e-07, + "loss": 0.3374, + "step": 6597 + }, + { + "epoch": 4.575589459084605, + "grad_norm": 0.3743567709313739, + "learning_rate": 2.1787504094761268e-07, + "loss": 0.3351, + "step": 6598 + }, + { + "epoch": 4.57628294036061, + "grad_norm": 0.4383388425995304, + "learning_rate": 2.1716880809273978e-07, + "loss": 0.3343, + "step": 6599 + }, + { + "epoch": 4.5769764216366156, + "grad_norm": 0.5747238472588304, + "learning_rate": 2.1646369629885867e-07, + "loss": 0.3493, + "step": 6600 + }, + { + "epoch": 4.577669902912621, + "grad_norm": 0.39377261634329735, + "learning_rate": 2.1575970573124437e-07, + "loss": 0.3189, + "step": 6601 + }, + { + "epoch": 4.578363384188627, + "grad_norm": 0.5130892949484, + "learning_rate": 2.1505683655490495e-07, + "loss": 0.3356, + "step": 6602 + }, + { + "epoch": 4.579056865464633, + "grad_norm": 0.3909262813168535, + "learning_rate": 2.1435508893458912e-07, + "loss": 0.3621, + "step": 6603 + }, + { + "epoch": 4.579750346740638, + "grad_norm": 0.5019565370247855, + "learning_rate": 2.1365446303478142e-07, + "loss": 0.3943, + "step": 6604 + }, + { + "epoch": 4.5804438280166435, + "grad_norm": 0.41810129940267743, + "learning_rate": 2.1295495901970275e-07, + "loss": 0.3392, + "step": 6605 + }, + { + "epoch": 4.581137309292649, + "grad_norm": 0.4398827655216951, + "learning_rate": 2.1225657705331249e-07, + "loss": 0.3403, + "step": 6606 + }, + { + "epoch": 4.581830790568654, + "grad_norm": 0.4091014458505974, + "learning_rate": 2.115593172993058e-07, + "loss": 0.3645, + "step": 6607 + }, + { + "epoch": 4.58252427184466, + "grad_norm": 0.43415775581988225, + "learning_rate": 2.108631799211158e-07, + "loss": 0.3434, + "step": 6608 + }, + { + "epoch": 4.583217753120666, + "grad_norm": 0.4739394088706493, + "learning_rate": 2.1016816508191263e-07, + "loss": 0.3534, + "step": 6609 + }, + { + "epoch": 4.5839112343966715, + "grad_norm": 0.4090274718583703, + "learning_rate": 2.0947427294460142e-07, + "loss": 0.3432, + "step": 6610 + }, + { + "epoch": 4.584604715672677, + "grad_norm": 0.39989999743785615, + "learning_rate": 2.0878150367182547e-07, + "loss": 0.3518, + "step": 6611 + }, + { + "epoch": 4.585298196948682, + "grad_norm": 0.4548783797876923, + "learning_rate": 2.0808985742596653e-07, + "loss": 0.4076, + "step": 6612 + }, + { + "epoch": 4.585991678224688, + "grad_norm": 0.44472465670113426, + "learning_rate": 2.073993343691394e-07, + "loss": 0.3935, + "step": 6613 + }, + { + "epoch": 4.586685159500694, + "grad_norm": 0.4368895525570366, + "learning_rate": 2.0670993466319956e-07, + "loss": 0.3546, + "step": 6614 + }, + { + "epoch": 4.5873786407766985, + "grad_norm": 0.4145043639499307, + "learning_rate": 2.0602165846973498e-07, + "loss": 0.3394, + "step": 6615 + }, + { + "epoch": 4.588072122052704, + "grad_norm": 0.539968477580173, + "learning_rate": 2.05334505950075e-07, + "loss": 0.3364, + "step": 6616 + }, + { + "epoch": 4.58876560332871, + "grad_norm": 0.40832294925657137, + "learning_rate": 2.0464847726528236e-07, + "loss": 0.3594, + "step": 6617 + }, + { + "epoch": 4.589459084604716, + "grad_norm": 0.7232419628694357, + "learning_rate": 2.0396357257615684e-07, + "loss": 0.332, + "step": 6618 + }, + { + "epoch": 4.590152565880722, + "grad_norm": 0.4157838669127496, + "learning_rate": 2.0327979204323557e-07, + "loss": 0.3588, + "step": 6619 + }, + { + "epoch": 4.5908460471567265, + "grad_norm": 0.39598205637842165, + "learning_rate": 2.025971358267914e-07, + "loss": 0.3884, + "step": 6620 + }, + { + "epoch": 4.591539528432732, + "grad_norm": 0.41932620594896597, + "learning_rate": 2.019156040868342e-07, + "loss": 0.3255, + "step": 6621 + }, + { + "epoch": 4.592233009708738, + "grad_norm": 0.40689397698464574, + "learning_rate": 2.0123519698311e-07, + "loss": 0.3279, + "step": 6622 + }, + { + "epoch": 4.592926490984743, + "grad_norm": 0.4255000799197367, + "learning_rate": 2.0055591467510126e-07, + "loss": 0.3406, + "step": 6623 + }, + { + "epoch": 4.593619972260749, + "grad_norm": 0.4453047957874876, + "learning_rate": 1.9987775732202618e-07, + "loss": 0.3622, + "step": 6624 + }, + { + "epoch": 4.5943134535367545, + "grad_norm": 0.36916286051210967, + "learning_rate": 1.9920072508284204e-07, + "loss": 0.3305, + "step": 6625 + }, + { + "epoch": 4.59500693481276, + "grad_norm": 0.42935231157805465, + "learning_rate": 1.9852481811623803e-07, + "loss": 0.3315, + "step": 6626 + }, + { + "epoch": 4.595700416088766, + "grad_norm": 0.44283573166986173, + "learning_rate": 1.978500365806435e-07, + "loss": 0.3805, + "step": 6627 + }, + { + "epoch": 4.596393897364771, + "grad_norm": 0.40192658742637344, + "learning_rate": 1.971763806342214e-07, + "loss": 0.3055, + "step": 6628 + }, + { + "epoch": 4.597087378640777, + "grad_norm": 0.4129234107909708, + "learning_rate": 1.9650385043487152e-07, + "loss": 0.3271, + "step": 6629 + }, + { + "epoch": 4.597780859916782, + "grad_norm": 0.4426665403270593, + "learning_rate": 1.958324461402311e-07, + "loss": 0.3738, + "step": 6630 + }, + { + "epoch": 4.598474341192787, + "grad_norm": 0.41428201391912783, + "learning_rate": 1.9516216790767151e-07, + "loss": 0.3237, + "step": 6631 + }, + { + "epoch": 4.599167822468793, + "grad_norm": 0.41145024535793706, + "learning_rate": 1.9449301589430148e-07, + "loss": 0.3404, + "step": 6632 + }, + { + "epoch": 4.599861303744799, + "grad_norm": 0.4152177258658786, + "learning_rate": 1.93824990256965e-07, + "loss": 0.3437, + "step": 6633 + }, + { + "epoch": 4.600554785020805, + "grad_norm": 0.4342061013767659, + "learning_rate": 1.9315809115224348e-07, + "loss": 0.4081, + "step": 6634 + }, + { + "epoch": 4.60124826629681, + "grad_norm": 0.44760226540203973, + "learning_rate": 1.9249231873645247e-07, + "loss": 0.3799, + "step": 6635 + }, + { + "epoch": 4.601941747572815, + "grad_norm": 0.42085826953626726, + "learning_rate": 1.9182767316564433e-07, + "loss": 0.3529, + "step": 6636 + }, + { + "epoch": 4.602635228848821, + "grad_norm": 0.4108062638522855, + "learning_rate": 1.911641545956072e-07, + "loss": 0.3581, + "step": 6637 + }, + { + "epoch": 4.603328710124827, + "grad_norm": 0.4176745703913112, + "learning_rate": 1.9050176318186508e-07, + "loss": 0.3794, + "step": 6638 + }, + { + "epoch": 4.604022191400832, + "grad_norm": 0.3975265994014037, + "learning_rate": 1.898404990796776e-07, + "loss": 0.3609, + "step": 6639 + }, + { + "epoch": 4.6047156726768375, + "grad_norm": 0.47061066044974603, + "learning_rate": 1.8918036244404026e-07, + "loss": 0.3352, + "step": 6640 + }, + { + "epoch": 4.605409153952843, + "grad_norm": 0.4358234354241011, + "learning_rate": 1.8852135342968481e-07, + "loss": 0.3957, + "step": 6641 + }, + { + "epoch": 4.606102635228849, + "grad_norm": 0.3730065338551913, + "learning_rate": 1.878634721910766e-07, + "loss": 0.324, + "step": 6642 + }, + { + "epoch": 4.606796116504855, + "grad_norm": 0.42398317259038704, + "learning_rate": 1.8720671888242058e-07, + "loss": 0.3847, + "step": 6643 + }, + { + "epoch": 4.60748959778086, + "grad_norm": 0.4359676935233792, + "learning_rate": 1.8655109365765312e-07, + "loss": 0.3513, + "step": 6644 + }, + { + "epoch": 4.608183079056865, + "grad_norm": 0.41843473412501964, + "learning_rate": 1.8589659667044847e-07, + "loss": 0.3487, + "step": 6645 + }, + { + "epoch": 4.608876560332871, + "grad_norm": 0.3955520622447244, + "learning_rate": 1.8524322807421723e-07, + "loss": 0.3845, + "step": 6646 + }, + { + "epoch": 4.609570041608876, + "grad_norm": 0.4237314241468602, + "learning_rate": 1.845909880221025e-07, + "loss": 0.3566, + "step": 6647 + }, + { + "epoch": 4.610263522884882, + "grad_norm": 0.432610738349929, + "learning_rate": 1.839398766669853e-07, + "loss": 0.3643, + "step": 6648 + }, + { + "epoch": 4.610957004160888, + "grad_norm": 0.3690249344272704, + "learning_rate": 1.8328989416148192e-07, + "loss": 0.3004, + "step": 6649 + }, + { + "epoch": 4.611650485436893, + "grad_norm": 0.4143862370385774, + "learning_rate": 1.8264104065794265e-07, + "loss": 0.3522, + "step": 6650 + }, + { + "epoch": 4.612343966712899, + "grad_norm": 0.4209448083559079, + "learning_rate": 1.8199331630845418e-07, + "loss": 0.3489, + "step": 6651 + }, + { + "epoch": 4.613037447988904, + "grad_norm": 0.43925667239934624, + "learning_rate": 1.81346721264839e-07, + "loss": 0.3654, + "step": 6652 + }, + { + "epoch": 4.61373092926491, + "grad_norm": 0.8191495949140846, + "learning_rate": 1.8070125567865415e-07, + "loss": 0.4005, + "step": 6653 + }, + { + "epoch": 4.614424410540916, + "grad_norm": 0.3921487423882409, + "learning_rate": 1.8005691970119254e-07, + "loss": 0.3195, + "step": 6654 + }, + { + "epoch": 4.6151178918169204, + "grad_norm": 0.43231313283189815, + "learning_rate": 1.7941371348348057e-07, + "loss": 0.3714, + "step": 6655 + }, + { + "epoch": 4.615811373092926, + "grad_norm": 0.41016272036284057, + "learning_rate": 1.7877163717628155e-07, + "loss": 0.3184, + "step": 6656 + }, + { + "epoch": 4.616504854368932, + "grad_norm": 0.45460623159582375, + "learning_rate": 1.781306909300945e-07, + "loss": 0.3667, + "step": 6657 + }, + { + "epoch": 4.617198335644938, + "grad_norm": 0.40401495288836997, + "learning_rate": 1.774908748951515e-07, + "loss": 0.3208, + "step": 6658 + }, + { + "epoch": 4.6178918169209435, + "grad_norm": 0.3728093803124587, + "learning_rate": 1.76852189221422e-07, + "loss": 0.3304, + "step": 6659 + }, + { + "epoch": 4.618585298196948, + "grad_norm": 0.4212808230587952, + "learning_rate": 1.7621463405860683e-07, + "loss": 0.3094, + "step": 6660 + }, + { + "epoch": 4.619278779472954, + "grad_norm": 0.41247273143391566, + "learning_rate": 1.75578209556147e-07, + "loss": 0.3401, + "step": 6661 + }, + { + "epoch": 4.61997226074896, + "grad_norm": 0.4151187889067134, + "learning_rate": 1.749429158632149e-07, + "loss": 0.3125, + "step": 6662 + }, + { + "epoch": 4.620665742024965, + "grad_norm": 0.42407205762886613, + "learning_rate": 1.7430875312871797e-07, + "loss": 0.3729, + "step": 6663 + }, + { + "epoch": 4.621359223300971, + "grad_norm": 0.391440849609123, + "learning_rate": 1.736757215013013e-07, + "loss": 0.3667, + "step": 6664 + }, + { + "epoch": 4.622052704576976, + "grad_norm": 0.38587659761782256, + "learning_rate": 1.730438211293406e-07, + "loss": 0.3173, + "step": 6665 + }, + { + "epoch": 4.622746185852982, + "grad_norm": 0.4384276094034426, + "learning_rate": 1.724130521609496e-07, + "loss": 0.3485, + "step": 6666 + }, + { + "epoch": 4.623439667128988, + "grad_norm": 0.38801601832069726, + "learning_rate": 1.7178341474397674e-07, + "loss": 0.3477, + "step": 6667 + }, + { + "epoch": 4.624133148404993, + "grad_norm": 0.4183163085913416, + "learning_rate": 1.711549090260034e-07, + "loss": 0.3572, + "step": 6668 + }, + { + "epoch": 4.624826629680999, + "grad_norm": 0.4222211756985973, + "learning_rate": 1.7052753515434728e-07, + "loss": 0.3689, + "step": 6669 + }, + { + "epoch": 4.625520110957004, + "grad_norm": 0.3899032292209465, + "learning_rate": 1.699012932760602e-07, + "loss": 0.3687, + "step": 6670 + }, + { + "epoch": 4.62621359223301, + "grad_norm": 0.40036377433853393, + "learning_rate": 1.6927618353792862e-07, + "loss": 0.3614, + "step": 6671 + }, + { + "epoch": 4.626907073509015, + "grad_norm": 0.4141272469217083, + "learning_rate": 1.686522060864748e-07, + "loss": 0.3874, + "step": 6672 + }, + { + "epoch": 4.627600554785021, + "grad_norm": 0.39555092439102046, + "learning_rate": 1.6802936106795286e-07, + "loss": 0.3438, + "step": 6673 + }, + { + "epoch": 4.6282940360610265, + "grad_norm": 0.3879885169859168, + "learning_rate": 1.674076486283538e-07, + "loss": 0.3751, + "step": 6674 + }, + { + "epoch": 4.628987517337032, + "grad_norm": 0.5668088213465785, + "learning_rate": 1.6678706891340325e-07, + "loss": 0.3231, + "step": 6675 + }, + { + "epoch": 4.629680998613037, + "grad_norm": 0.3744846867711921, + "learning_rate": 1.6616762206855928e-07, + "loss": 0.3493, + "step": 6676 + }, + { + "epoch": 4.630374479889043, + "grad_norm": 0.3684355729031906, + "learning_rate": 1.6554930823901695e-07, + "loss": 0.3112, + "step": 6677 + }, + { + "epoch": 4.631067961165049, + "grad_norm": 0.40771883102611195, + "learning_rate": 1.6493212756970356e-07, + "loss": 0.3379, + "step": 6678 + }, + { + "epoch": 4.6317614424410545, + "grad_norm": 0.46091316799899035, + "learning_rate": 1.6431608020528233e-07, + "loss": 0.282, + "step": 6679 + }, + { + "epoch": 4.632454923717059, + "grad_norm": 0.4169117708550667, + "learning_rate": 1.637011662901511e-07, + "loss": 0.3756, + "step": 6680 + }, + { + "epoch": 4.633148404993065, + "grad_norm": 0.4194509220578528, + "learning_rate": 1.6308738596843953e-07, + "loss": 0.3617, + "step": 6681 + }, + { + "epoch": 4.633841886269071, + "grad_norm": 0.43122335718609295, + "learning_rate": 1.6247473938401426e-07, + "loss": 0.4036, + "step": 6682 + }, + { + "epoch": 4.634535367545077, + "grad_norm": 0.3886736164761595, + "learning_rate": 1.6186322668047538e-07, + "loss": 0.3588, + "step": 6683 + }, + { + "epoch": 4.6352288488210815, + "grad_norm": 0.435130127048327, + "learning_rate": 1.6125284800115604e-07, + "loss": 0.357, + "step": 6684 + }, + { + "epoch": 4.635922330097087, + "grad_norm": 0.42392845633829646, + "learning_rate": 1.6064360348912567e-07, + "loss": 0.4047, + "step": 6685 + }, + { + "epoch": 4.636615811373093, + "grad_norm": 0.37841049734660087, + "learning_rate": 1.600354932871867e-07, + "loss": 0.3347, + "step": 6686 + }, + { + "epoch": 4.637309292649099, + "grad_norm": 0.46564699368728796, + "learning_rate": 1.594285175378746e-07, + "loss": 0.3554, + "step": 6687 + }, + { + "epoch": 4.638002773925104, + "grad_norm": 0.3978575411021788, + "learning_rate": 1.5882267638346217e-07, + "loss": 0.3124, + "step": 6688 + }, + { + "epoch": 4.6386962552011095, + "grad_norm": 0.38198941755229715, + "learning_rate": 1.5821796996595197e-07, + "loss": 0.2977, + "step": 6689 + }, + { + "epoch": 4.639389736477115, + "grad_norm": 0.39947531859126206, + "learning_rate": 1.5761439842708392e-07, + "loss": 0.3441, + "step": 6690 + }, + { + "epoch": 4.640083217753121, + "grad_norm": 0.37601127611692114, + "learning_rate": 1.5701196190833102e-07, + "loss": 0.3134, + "step": 6691 + }, + { + "epoch": 4.640776699029126, + "grad_norm": 0.44854689077246596, + "learning_rate": 1.5641066055089916e-07, + "loss": 0.3401, + "step": 6692 + }, + { + "epoch": 4.641470180305132, + "grad_norm": 0.3968189250149672, + "learning_rate": 1.5581049449573004e-07, + "loss": 0.3388, + "step": 6693 + }, + { + "epoch": 4.6421636615811375, + "grad_norm": 0.4107917651039533, + "learning_rate": 1.5521146388349783e-07, + "loss": 0.359, + "step": 6694 + }, + { + "epoch": 4.642857142857143, + "grad_norm": 0.3896473693015918, + "learning_rate": 1.5461356885461077e-07, + "loss": 0.3325, + "step": 6695 + }, + { + "epoch": 4.643550624133148, + "grad_norm": 0.40495332063411876, + "learning_rate": 1.5401680954921062e-07, + "loss": 0.3571, + "step": 6696 + }, + { + "epoch": 4.644244105409154, + "grad_norm": 0.4103280332373116, + "learning_rate": 1.5342118610717438e-07, + "loss": 0.342, + "step": 6697 + }, + { + "epoch": 4.64493758668516, + "grad_norm": 0.46280859618808157, + "learning_rate": 1.5282669866811152e-07, + "loss": 0.3803, + "step": 6698 + }, + { + "epoch": 4.645631067961165, + "grad_norm": 0.41744347843303314, + "learning_rate": 1.5223334737136608e-07, + "loss": 0.3635, + "step": 6699 + }, + { + "epoch": 4.64632454923717, + "grad_norm": 0.406194187007945, + "learning_rate": 1.5164113235601462e-07, + "loss": 0.3422, + "step": 6700 + }, + { + "epoch": 4.647018030513176, + "grad_norm": 0.46002313056312844, + "learning_rate": 1.5105005376086778e-07, + "loss": 0.4384, + "step": 6701 + }, + { + "epoch": 4.647711511789182, + "grad_norm": 0.37639387252560136, + "learning_rate": 1.504601117244714e-07, + "loss": 0.3158, + "step": 6702 + }, + { + "epoch": 4.648404993065188, + "grad_norm": 0.39863091718502724, + "learning_rate": 1.498713063851026e-07, + "loss": 0.3365, + "step": 6703 + }, + { + "epoch": 4.6490984743411925, + "grad_norm": 0.4018608400880308, + "learning_rate": 1.4928363788077327e-07, + "loss": 0.3631, + "step": 6704 + }, + { + "epoch": 4.649791955617198, + "grad_norm": 0.4104518088407998, + "learning_rate": 1.4869710634922762e-07, + "loss": 0.3607, + "step": 6705 + }, + { + "epoch": 4.650485436893204, + "grad_norm": 0.4534698134523714, + "learning_rate": 1.4811171192794628e-07, + "loss": 0.3381, + "step": 6706 + }, + { + "epoch": 4.65117891816921, + "grad_norm": 0.4227278924218095, + "learning_rate": 1.475274547541411e-07, + "loss": 0.3677, + "step": 6707 + }, + { + "epoch": 4.651872399445215, + "grad_norm": 0.5608003809455238, + "learning_rate": 1.4694433496475702e-07, + "loss": 0.4047, + "step": 6708 + }, + { + "epoch": 4.6525658807212205, + "grad_norm": 0.39603586592941703, + "learning_rate": 1.4636235269647359e-07, + "loss": 0.3224, + "step": 6709 + }, + { + "epoch": 4.653259361997226, + "grad_norm": 0.41740625091370726, + "learning_rate": 1.4578150808570223e-07, + "loss": 0.3437, + "step": 6710 + }, + { + "epoch": 4.653952843273232, + "grad_norm": 0.43748035655726614, + "learning_rate": 1.4520180126859018e-07, + "loss": 0.3691, + "step": 6711 + }, + { + "epoch": 4.654646324549237, + "grad_norm": 0.4032783662769279, + "learning_rate": 1.4462323238101538e-07, + "loss": 0.3647, + "step": 6712 + }, + { + "epoch": 4.655339805825243, + "grad_norm": 0.36993848544011976, + "learning_rate": 1.4404580155859106e-07, + "loss": 0.3179, + "step": 6713 + }, + { + "epoch": 4.656033287101248, + "grad_norm": 0.41115462936777036, + "learning_rate": 1.4346950893666167e-07, + "loss": 0.3364, + "step": 6714 + }, + { + "epoch": 4.656726768377254, + "grad_norm": 0.4363038939442896, + "learning_rate": 1.42894354650307e-07, + "loss": 0.3194, + "step": 6715 + }, + { + "epoch": 4.657420249653259, + "grad_norm": 0.4485323098363289, + "learning_rate": 1.423203388343386e-07, + "loss": 0.3088, + "step": 6716 + }, + { + "epoch": 4.658113730929265, + "grad_norm": 0.4196024765984699, + "learning_rate": 1.4174746162330278e-07, + "loss": 0.4428, + "step": 6717 + }, + { + "epoch": 4.658807212205271, + "grad_norm": 0.4177274097626942, + "learning_rate": 1.4117572315147598e-07, + "loss": 0.3495, + "step": 6718 + }, + { + "epoch": 4.659500693481276, + "grad_norm": 0.45901509451905975, + "learning_rate": 1.4060512355287048e-07, + "loss": 0.3592, + "step": 6719 + }, + { + "epoch": 4.660194174757281, + "grad_norm": 0.4740244898274189, + "learning_rate": 1.4003566296123095e-07, + "loss": 0.3887, + "step": 6720 + }, + { + "epoch": 4.660887656033287, + "grad_norm": 0.38106305206428936, + "learning_rate": 1.394673415100345e-07, + "loss": 0.3494, + "step": 6721 + }, + { + "epoch": 4.661581137309293, + "grad_norm": 0.4544181776548709, + "learning_rate": 1.3890015933249124e-07, + "loss": 0.2719, + "step": 6722 + }, + { + "epoch": 4.662274618585299, + "grad_norm": 0.4030455780376685, + "learning_rate": 1.3833411656154483e-07, + "loss": 0.2863, + "step": 6723 + }, + { + "epoch": 4.6629680998613035, + "grad_norm": 0.37873720359007057, + "learning_rate": 1.3776921332987193e-07, + "loss": 0.3341, + "step": 6724 + }, + { + "epoch": 4.663661581137309, + "grad_norm": 0.4580870905083957, + "learning_rate": 1.372054497698816e-07, + "loss": 0.353, + "step": 6725 + }, + { + "epoch": 4.664355062413315, + "grad_norm": 0.4001016690633297, + "learning_rate": 1.3664282601371536e-07, + "loss": 0.3357, + "step": 6726 + }, + { + "epoch": 4.665048543689321, + "grad_norm": 0.4168440857226407, + "learning_rate": 1.3608134219324886e-07, + "loss": 0.3642, + "step": 6727 + }, + { + "epoch": 4.665742024965326, + "grad_norm": 0.4165023500950493, + "learning_rate": 1.3552099844009013e-07, + "loss": 0.3203, + "step": 6728 + }, + { + "epoch": 4.666435506241331, + "grad_norm": 0.3893905445083926, + "learning_rate": 1.349617948855786e-07, + "loss": 0.3429, + "step": 6729 + }, + { + "epoch": 4.667128987517337, + "grad_norm": 0.4335745681172854, + "learning_rate": 1.3440373166078824e-07, + "loss": 0.3795, + "step": 6730 + }, + { + "epoch": 4.667822468793343, + "grad_norm": 0.4708287411816723, + "learning_rate": 1.3384680889652502e-07, + "loss": 0.3387, + "step": 6731 + }, + { + "epoch": 4.668515950069348, + "grad_norm": 0.4292906387974233, + "learning_rate": 1.332910267233267e-07, + "loss": 0.3435, + "step": 6732 + }, + { + "epoch": 4.669209431345354, + "grad_norm": 0.4967563615936282, + "learning_rate": 1.3273638527146638e-07, + "loss": 0.3353, + "step": 6733 + }, + { + "epoch": 4.669902912621359, + "grad_norm": 0.3737189666638068, + "learning_rate": 1.321828846709461e-07, + "loss": 0.3506, + "step": 6734 + }, + { + "epoch": 4.670596393897365, + "grad_norm": 0.4240505954098639, + "learning_rate": 1.3163052505150375e-07, + "loss": 0.3633, + "step": 6735 + }, + { + "epoch": 4.67128987517337, + "grad_norm": 0.39289518288564, + "learning_rate": 1.3107930654260804e-07, + "loss": 0.3536, + "step": 6736 + }, + { + "epoch": 4.671983356449376, + "grad_norm": 0.4567296836979554, + "learning_rate": 1.3052922927346e-07, + "loss": 0.3513, + "step": 6737 + }, + { + "epoch": 4.672676837725382, + "grad_norm": 0.38757277985861877, + "learning_rate": 1.2998029337299433e-07, + "loss": 0.3395, + "step": 6738 + }, + { + "epoch": 4.673370319001387, + "grad_norm": 0.39599690112778674, + "learning_rate": 1.2943249896987864e-07, + "loss": 0.3288, + "step": 6739 + }, + { + "epoch": 4.674063800277392, + "grad_norm": 0.42239428336219415, + "learning_rate": 1.2888584619250966e-07, + "loss": 0.3737, + "step": 6740 + }, + { + "epoch": 4.674757281553398, + "grad_norm": 0.3857297733744965, + "learning_rate": 1.2834033516902044e-07, + "loss": 0.323, + "step": 6741 + }, + { + "epoch": 4.675450762829404, + "grad_norm": 0.967508623031919, + "learning_rate": 1.277959660272743e-07, + "loss": 0.3179, + "step": 6742 + }, + { + "epoch": 4.6761442441054095, + "grad_norm": 0.4682470535663424, + "learning_rate": 1.2725273889486745e-07, + "loss": 0.3635, + "step": 6743 + }, + { + "epoch": 4.676837725381414, + "grad_norm": 0.43960960852529, + "learning_rate": 1.2671065389912917e-07, + "loss": 0.3914, + "step": 6744 + }, + { + "epoch": 4.67753120665742, + "grad_norm": 0.5233143144705975, + "learning_rate": 1.2616971116711895e-07, + "loss": 0.3403, + "step": 6745 + }, + { + "epoch": 4.678224687933426, + "grad_norm": 0.4100101852721493, + "learning_rate": 1.2562991082563092e-07, + "loss": 0.3233, + "step": 6746 + }, + { + "epoch": 4.678918169209432, + "grad_norm": 0.4484855491389793, + "learning_rate": 1.2509125300118996e-07, + "loss": 0.3245, + "step": 6747 + }, + { + "epoch": 4.679611650485437, + "grad_norm": 0.38710294017866376, + "learning_rate": 1.2455373782005343e-07, + "loss": 0.3686, + "step": 6748 + }, + { + "epoch": 4.680305131761442, + "grad_norm": 0.45062554431993884, + "learning_rate": 1.2401736540821108e-07, + "loss": 0.3501, + "step": 6749 + }, + { + "epoch": 4.680998613037448, + "grad_norm": 0.38733770690691943, + "learning_rate": 1.2348213589138402e-07, + "loss": 0.3448, + "step": 6750 + }, + { + "epoch": 4.681692094313454, + "grad_norm": 0.42319529577596543, + "learning_rate": 1.2294804939502746e-07, + "loss": 0.3546, + "step": 6751 + }, + { + "epoch": 4.682385575589459, + "grad_norm": 0.5086293836259173, + "learning_rate": 1.224151060443274e-07, + "loss": 0.412, + "step": 6752 + }, + { + "epoch": 4.6830790568654646, + "grad_norm": 0.5485209236035724, + "learning_rate": 1.2188330596420106e-07, + "loss": 0.3551, + "step": 6753 + }, + { + "epoch": 4.68377253814147, + "grad_norm": 0.4102678308013505, + "learning_rate": 1.213526492792988e-07, + "loss": 0.3717, + "step": 6754 + }, + { + "epoch": 4.684466019417476, + "grad_norm": 0.47525191378298826, + "learning_rate": 1.2082313611400276e-07, + "loss": 0.3928, + "step": 6755 + }, + { + "epoch": 4.685159500693481, + "grad_norm": 0.4439277804978099, + "learning_rate": 1.2029476659242644e-07, + "loss": 0.3505, + "step": 6756 + }, + { + "epoch": 4.685852981969487, + "grad_norm": 0.4132820277534149, + "learning_rate": 1.1976754083841747e-07, + "loss": 0.3638, + "step": 6757 + }, + { + "epoch": 4.6865464632454925, + "grad_norm": 0.39405119535260347, + "learning_rate": 1.192414589755514e-07, + "loss": 0.3481, + "step": 6758 + }, + { + "epoch": 4.687239944521498, + "grad_norm": 0.4860560758389836, + "learning_rate": 1.187165211271396e-07, + "loss": 0.3706, + "step": 6759 + }, + { + "epoch": 4.687933425797503, + "grad_norm": 0.4061770014012126, + "learning_rate": 1.1819272741622367e-07, + "loss": 0.3335, + "step": 6760 + }, + { + "epoch": 4.688626907073509, + "grad_norm": 0.40806900505269417, + "learning_rate": 1.176700779655765e-07, + "loss": 0.3001, + "step": 6761 + }, + { + "epoch": 4.689320388349515, + "grad_norm": 0.3642532282080855, + "learning_rate": 1.1714857289770399e-07, + "loss": 0.3393, + "step": 6762 + }, + { + "epoch": 4.6900138696255205, + "grad_norm": 0.39553145474978363, + "learning_rate": 1.1662821233484167e-07, + "loss": 0.3451, + "step": 6763 + }, + { + "epoch": 4.690707350901525, + "grad_norm": 0.4529794811430911, + "learning_rate": 1.1610899639896034e-07, + "loss": 0.38, + "step": 6764 + }, + { + "epoch": 4.691400832177531, + "grad_norm": 0.3815225750665992, + "learning_rate": 1.155909252117593e-07, + "loss": 0.3064, + "step": 6765 + }, + { + "epoch": 4.692094313453537, + "grad_norm": 0.39076056913424856, + "learning_rate": 1.150739988946703e-07, + "loss": 0.3291, + "step": 6766 + }, + { + "epoch": 4.692787794729543, + "grad_norm": 0.4061633583383636, + "learning_rate": 1.145582175688581e-07, + "loss": 0.3567, + "step": 6767 + }, + { + "epoch": 4.6934812760055475, + "grad_norm": 0.40762646298708616, + "learning_rate": 1.140435813552171e-07, + "loss": 0.3494, + "step": 6768 + }, + { + "epoch": 4.694174757281553, + "grad_norm": 0.4105204089115084, + "learning_rate": 1.1353009037437523e-07, + "loss": 0.3817, + "step": 6769 + }, + { + "epoch": 4.694868238557559, + "grad_norm": 0.4415513186430572, + "learning_rate": 1.1301774474669125e-07, + "loss": 0.3537, + "step": 6770 + }, + { + "epoch": 4.695561719833565, + "grad_norm": 0.4306115816350744, + "learning_rate": 1.1250654459225407e-07, + "loss": 0.3422, + "step": 6771 + }, + { + "epoch": 4.69625520110957, + "grad_norm": 0.4292024895730401, + "learning_rate": 1.1199649003088619e-07, + "loss": 0.4078, + "step": 6772 + }, + { + "epoch": 4.6969486823855755, + "grad_norm": 0.40697351675677496, + "learning_rate": 1.1148758118214087e-07, + "loss": 0.3638, + "step": 6773 + }, + { + "epoch": 4.697642163661581, + "grad_norm": 0.4135029922149521, + "learning_rate": 1.1097981816530157e-07, + "loss": 0.3253, + "step": 6774 + }, + { + "epoch": 4.698335644937587, + "grad_norm": 0.384596218410337, + "learning_rate": 1.104732010993853e-07, + "loss": 0.3482, + "step": 6775 + }, + { + "epoch": 4.699029126213592, + "grad_norm": 0.5598074012488575, + "learning_rate": 1.0996773010313876e-07, + "loss": 0.3398, + "step": 6776 + }, + { + "epoch": 4.699722607489598, + "grad_norm": 0.4235991723272917, + "learning_rate": 1.0946340529504108e-07, + "loss": 0.3598, + "step": 6777 + }, + { + "epoch": 4.7004160887656035, + "grad_norm": 0.4191143944749233, + "learning_rate": 1.0896022679330265e-07, + "loss": 0.3313, + "step": 6778 + }, + { + "epoch": 4.701109570041609, + "grad_norm": 0.45766788222001814, + "learning_rate": 1.0845819471586416e-07, + "loss": 0.3375, + "step": 6779 + }, + { + "epoch": 4.701803051317614, + "grad_norm": 0.41868433977160835, + "learning_rate": 1.0795730918039871e-07, + "loss": 0.3534, + "step": 6780 + }, + { + "epoch": 4.70249653259362, + "grad_norm": 0.4253257730744769, + "learning_rate": 1.0745757030431015e-07, + "loss": 0.3524, + "step": 6781 + }, + { + "epoch": 4.703190013869626, + "grad_norm": 0.41838230902361123, + "learning_rate": 1.0695897820473367e-07, + "loss": 0.3788, + "step": 6782 + }, + { + "epoch": 4.703883495145631, + "grad_norm": 0.4365048673807159, + "learning_rate": 1.0646153299853523e-07, + "loss": 0.3748, + "step": 6783 + }, + { + "epoch": 4.704576976421636, + "grad_norm": 0.4488075569062203, + "learning_rate": 1.059652348023138e-07, + "loss": 0.3357, + "step": 6784 + }, + { + "epoch": 4.705270457697642, + "grad_norm": 0.5641354438931263, + "learning_rate": 1.0547008373239576e-07, + "loss": 0.3453, + "step": 6785 + }, + { + "epoch": 4.705963938973648, + "grad_norm": 0.40413271735483397, + "learning_rate": 1.0497607990484326e-07, + "loss": 0.3776, + "step": 6786 + }, + { + "epoch": 4.706657420249654, + "grad_norm": 0.43243871522736055, + "learning_rate": 1.0448322343544537e-07, + "loss": 0.3614, + "step": 6787 + }, + { + "epoch": 4.7073509015256585, + "grad_norm": 0.4309666969442699, + "learning_rate": 1.0399151443972521e-07, + "loss": 0.3593, + "step": 6788 + }, + { + "epoch": 4.708044382801664, + "grad_norm": 0.4122702124215238, + "learning_rate": 1.0350095303293617e-07, + "loss": 0.3323, + "step": 6789 + }, + { + "epoch": 4.70873786407767, + "grad_norm": 0.4281984543047366, + "learning_rate": 1.0301153933006126e-07, + "loss": 0.3159, + "step": 6790 + }, + { + "epoch": 4.709431345353676, + "grad_norm": 0.41312288396118063, + "learning_rate": 1.0252327344581592e-07, + "loss": 0.3517, + "step": 6791 + }, + { + "epoch": 4.710124826629681, + "grad_norm": 0.39005107427814484, + "learning_rate": 1.0203615549464585e-07, + "loss": 0.3481, + "step": 6792 + }, + { + "epoch": 4.7108183079056865, + "grad_norm": 0.40156674774110573, + "learning_rate": 1.0155018559072805e-07, + "loss": 0.3428, + "step": 6793 + }, + { + "epoch": 4.711511789181692, + "grad_norm": 0.40568518077027116, + "learning_rate": 1.0106536384797083e-07, + "loss": 0.3374, + "step": 6794 + }, + { + "epoch": 4.712205270457698, + "grad_norm": 0.4125994683015578, + "learning_rate": 1.005816903800122e-07, + "loss": 0.3854, + "step": 6795 + }, + { + "epoch": 4.712898751733703, + "grad_norm": 0.4632445191240259, + "learning_rate": 1.0009916530022256e-07, + "loss": 0.3684, + "step": 6796 + }, + { + "epoch": 4.713592233009709, + "grad_norm": 0.3925149370949742, + "learning_rate": 9.961778872170202e-08, + "loss": 0.3358, + "step": 6797 + }, + { + "epoch": 4.714285714285714, + "grad_norm": 0.3987378195867283, + "learning_rate": 9.913756075728088e-08, + "loss": 0.3658, + "step": 6798 + }, + { + "epoch": 4.71497919556172, + "grad_norm": 0.4263902974832491, + "learning_rate": 9.86584815195224e-08, + "loss": 0.3387, + "step": 6799 + }, + { + "epoch": 4.715672676837725, + "grad_norm": 0.4141152908935563, + "learning_rate": 9.818055112071844e-08, + "loss": 0.3717, + "step": 6800 + }, + { + "epoch": 4.716366158113731, + "grad_norm": 0.4103975056095425, + "learning_rate": 9.770376967289219e-08, + "loss": 0.3636, + "step": 6801 + }, + { + "epoch": 4.717059639389737, + "grad_norm": 0.4047969443606198, + "learning_rate": 9.722813728779923e-08, + "loss": 0.3545, + "step": 6802 + }, + { + "epoch": 4.717753120665742, + "grad_norm": 0.4005027983543377, + "learning_rate": 9.675365407692205e-08, + "loss": 0.3784, + "step": 6803 + }, + { + "epoch": 4.718446601941747, + "grad_norm": 0.4525688038167993, + "learning_rate": 9.628032015147836e-08, + "loss": 0.3589, + "step": 6804 + }, + { + "epoch": 4.719140083217753, + "grad_norm": 0.4132518708972397, + "learning_rate": 9.580813562241276e-08, + "loss": 0.3479, + "step": 6805 + }, + { + "epoch": 4.719833564493759, + "grad_norm": 0.44356132531979187, + "learning_rate": 9.533710060040224e-08, + "loss": 0.3347, + "step": 6806 + }, + { + "epoch": 4.720527045769765, + "grad_norm": 0.43176070223402224, + "learning_rate": 9.486721519585462e-08, + "loss": 0.3678, + "step": 6807 + }, + { + "epoch": 4.721220527045769, + "grad_norm": 0.38632097807987065, + "learning_rate": 9.43984795189068e-08, + "loss": 0.3258, + "step": 6808 + }, + { + "epoch": 4.721914008321775, + "grad_norm": 0.4139292071412524, + "learning_rate": 9.393089367942754e-08, + "loss": 0.3029, + "step": 6809 + }, + { + "epoch": 4.722607489597781, + "grad_norm": 0.3774464897660259, + "learning_rate": 9.346445778701529e-08, + "loss": 0.318, + "step": 6810 + }, + { + "epoch": 4.723300970873787, + "grad_norm": 0.44178754709316437, + "learning_rate": 9.299917195099928e-08, + "loss": 0.3236, + "step": 6811 + }, + { + "epoch": 4.723994452149792, + "grad_norm": 0.4328788907551428, + "learning_rate": 9.253503628043947e-08, + "loss": 0.3591, + "step": 6812 + }, + { + "epoch": 4.724687933425797, + "grad_norm": 0.40237448569599843, + "learning_rate": 9.207205088412496e-08, + "loss": 0.3346, + "step": 6813 + }, + { + "epoch": 4.725381414701803, + "grad_norm": 0.400456602925986, + "learning_rate": 9.161021587057728e-08, + "loss": 0.3648, + "step": 6814 + }, + { + "epoch": 4.726074895977809, + "grad_norm": 0.4081929175488941, + "learning_rate": 9.114953134804705e-08, + "loss": 0.3569, + "step": 6815 + }, + { + "epoch": 4.726768377253814, + "grad_norm": 0.4638939004745626, + "learning_rate": 9.068999742451456e-08, + "loss": 0.3399, + "step": 6816 + }, + { + "epoch": 4.72746185852982, + "grad_norm": 0.41917451542274065, + "learning_rate": 9.0231614207692e-08, + "loss": 0.3161, + "step": 6817 + }, + { + "epoch": 4.728155339805825, + "grad_norm": 0.6287713531427881, + "learning_rate": 8.977438180502118e-08, + "loss": 0.4059, + "step": 6818 + }, + { + "epoch": 4.728848821081831, + "grad_norm": 0.390518839793498, + "learning_rate": 8.931830032367361e-08, + "loss": 0.3367, + "step": 6819 + }, + { + "epoch": 4.729542302357836, + "grad_norm": 0.4017996211638298, + "learning_rate": 8.8863369870551e-08, + "loss": 0.3519, + "step": 6820 + }, + { + "epoch": 4.730235783633842, + "grad_norm": 0.4079111533114061, + "learning_rate": 8.840959055228693e-08, + "loss": 0.3568, + "step": 6821 + }, + { + "epoch": 4.7309292649098476, + "grad_norm": 0.40067820562618406, + "learning_rate": 8.7956962475243e-08, + "loss": 0.3518, + "step": 6822 + }, + { + "epoch": 4.731622746185853, + "grad_norm": 0.39752367168541325, + "learning_rate": 8.75054857455132e-08, + "loss": 0.3396, + "step": 6823 + }, + { + "epoch": 4.732316227461858, + "grad_norm": 0.5247590165452723, + "learning_rate": 8.705516046891905e-08, + "loss": 0.3825, + "step": 6824 + }, + { + "epoch": 4.733009708737864, + "grad_norm": 0.41551390206060973, + "learning_rate": 8.660598675101384e-08, + "loss": 0.3569, + "step": 6825 + }, + { + "epoch": 4.73370319001387, + "grad_norm": 0.4039738878181958, + "learning_rate": 8.615796469708171e-08, + "loss": 0.356, + "step": 6826 + }, + { + "epoch": 4.7343966712898755, + "grad_norm": 0.4116222758803286, + "learning_rate": 8.57110944121342e-08, + "loss": 0.3487, + "step": 6827 + }, + { + "epoch": 4.73509015256588, + "grad_norm": 0.43387076097437016, + "learning_rate": 8.526537600091477e-08, + "loss": 0.3653, + "step": 6828 + }, + { + "epoch": 4.735783633841886, + "grad_norm": 0.4246650898209524, + "learning_rate": 8.482080956789817e-08, + "loss": 0.3437, + "step": 6829 + }, + { + "epoch": 4.736477115117892, + "grad_norm": 0.4275177627205395, + "learning_rate": 8.437739521728549e-08, + "loss": 0.3107, + "step": 6830 + }, + { + "epoch": 4.737170596393898, + "grad_norm": 0.3775912044979139, + "learning_rate": 8.393513305301138e-08, + "loss": 0.3391, + "step": 6831 + }, + { + "epoch": 4.737864077669903, + "grad_norm": 0.423519338366062, + "learning_rate": 8.34940231787379e-08, + "loss": 0.3995, + "step": 6832 + }, + { + "epoch": 4.738557558945908, + "grad_norm": 0.4584113477802442, + "learning_rate": 8.305406569785845e-08, + "loss": 0.333, + "step": 6833 + }, + { + "epoch": 4.739251040221914, + "grad_norm": 0.4293005261696209, + "learning_rate": 8.261526071349613e-08, + "loss": 0.3808, + "step": 6834 + }, + { + "epoch": 4.73994452149792, + "grad_norm": 0.422761223569215, + "learning_rate": 8.217760832850308e-08, + "loss": 0.3646, + "step": 6835 + }, + { + "epoch": 4.740638002773925, + "grad_norm": 0.4081122434989935, + "learning_rate": 8.174110864546225e-08, + "loss": 0.3827, + "step": 6836 + }, + { + "epoch": 4.7413314840499305, + "grad_norm": 0.5276095426233536, + "learning_rate": 8.13057617666857e-08, + "loss": 0.3406, + "step": 6837 + }, + { + "epoch": 4.742024965325936, + "grad_norm": 0.4446229105695271, + "learning_rate": 8.087156779421512e-08, + "loss": 0.3115, + "step": 6838 + }, + { + "epoch": 4.742718446601942, + "grad_norm": 0.40921992617296, + "learning_rate": 8.043852682982356e-08, + "loss": 0.3818, + "step": 6839 + }, + { + "epoch": 4.743411927877947, + "grad_norm": 0.44700400369817855, + "learning_rate": 8.000663897501259e-08, + "loss": 0.3355, + "step": 6840 + }, + { + "epoch": 4.744105409153953, + "grad_norm": 0.4047666969145994, + "learning_rate": 7.957590433101293e-08, + "loss": 0.3449, + "step": 6841 + }, + { + "epoch": 4.7447988904299585, + "grad_norm": 0.44510310727835206, + "learning_rate": 7.914632299878544e-08, + "loss": 0.3143, + "step": 6842 + }, + { + "epoch": 4.745492371705964, + "grad_norm": 0.44913663374622265, + "learning_rate": 7.871789507902183e-08, + "loss": 0.3911, + "step": 6843 + }, + { + "epoch": 4.746185852981969, + "grad_norm": 0.38812213867453554, + "learning_rate": 7.829062067214233e-08, + "loss": 0.3142, + "step": 6844 + }, + { + "epoch": 4.746879334257975, + "grad_norm": 0.3950616638944297, + "learning_rate": 7.78644998782957e-08, + "loss": 0.3325, + "step": 6845 + }, + { + "epoch": 4.747572815533981, + "grad_norm": 0.38795078335508976, + "learning_rate": 7.743953279736315e-08, + "loss": 0.3278, + "step": 6846 + }, + { + "epoch": 4.7482662968099865, + "grad_norm": 0.47077018282848426, + "learning_rate": 7.701571952895337e-08, + "loss": 0.3517, + "step": 6847 + }, + { + "epoch": 4.748959778085991, + "grad_norm": 0.42275888126951844, + "learning_rate": 7.659306017240464e-08, + "loss": 0.3799, + "step": 6848 + }, + { + "epoch": 4.749653259361997, + "grad_norm": 0.38755631522184725, + "learning_rate": 7.617155482678607e-08, + "loss": 0.3589, + "step": 6849 + }, + { + "epoch": 4.750346740638003, + "grad_norm": 0.4842499370481706, + "learning_rate": 7.575120359089416e-08, + "loss": 0.339, + "step": 6850 + }, + { + "epoch": 4.751040221914009, + "grad_norm": 0.41237035293335705, + "learning_rate": 7.53320065632579e-08, + "loss": 0.3193, + "step": 6851 + }, + { + "epoch": 4.7517337031900135, + "grad_norm": 0.4025419440534093, + "learning_rate": 7.491396384213312e-08, + "loss": 0.379, + "step": 6852 + }, + { + "epoch": 4.752427184466019, + "grad_norm": 0.39427184180682384, + "learning_rate": 7.449707552550533e-08, + "loss": 0.3016, + "step": 6853 + }, + { + "epoch": 4.753120665742025, + "grad_norm": 0.42822774283365916, + "learning_rate": 7.408134171109138e-08, + "loss": 0.323, + "step": 6854 + }, + { + "epoch": 4.753814147018031, + "grad_norm": 0.3921444185820582, + "learning_rate": 7.366676249633609e-08, + "loss": 0.3434, + "step": 6855 + }, + { + "epoch": 4.754507628294036, + "grad_norm": 0.4685555826086588, + "learning_rate": 7.325333797841283e-08, + "loss": 0.3641, + "step": 6856 + }, + { + "epoch": 4.7552011095700415, + "grad_norm": 0.3974537006608089, + "learning_rate": 7.284106825422632e-08, + "loss": 0.3623, + "step": 6857 + }, + { + "epoch": 4.755894590846047, + "grad_norm": 0.5464280208635189, + "learning_rate": 7.242995342040926e-08, + "loss": 0.3169, + "step": 6858 + }, + { + "epoch": 4.756588072122053, + "grad_norm": 0.43377542822096865, + "learning_rate": 7.201999357332346e-08, + "loss": 0.3472, + "step": 6859 + }, + { + "epoch": 4.757281553398058, + "grad_norm": 0.39961496026766874, + "learning_rate": 7.161118880906203e-08, + "loss": 0.3585, + "step": 6860 + }, + { + "epoch": 4.757975034674064, + "grad_norm": 0.40510658913640607, + "learning_rate": 7.120353922344447e-08, + "loss": 0.3359, + "step": 6861 + }, + { + "epoch": 4.7586685159500695, + "grad_norm": 0.40455382978082327, + "learning_rate": 7.079704491202099e-08, + "loss": 0.365, + "step": 6862 + }, + { + "epoch": 4.759361997226075, + "grad_norm": 0.39163038288764424, + "learning_rate": 7.03917059700715e-08, + "loss": 0.351, + "step": 6863 + }, + { + "epoch": 4.76005547850208, + "grad_norm": 0.7175277727970721, + "learning_rate": 6.998752249260387e-08, + "loss": 0.3135, + "step": 6864 + }, + { + "epoch": 4.760748959778086, + "grad_norm": 0.39138433989798443, + "learning_rate": 6.958449457435679e-08, + "loss": 0.3394, + "step": 6865 + }, + { + "epoch": 4.761442441054092, + "grad_norm": 0.4845060873969994, + "learning_rate": 6.918262230979577e-08, + "loss": 0.3646, + "step": 6866 + }, + { + "epoch": 4.762135922330097, + "grad_norm": 0.39139535674858816, + "learning_rate": 6.878190579311772e-08, + "loss": 0.3631, + "step": 6867 + }, + { + "epoch": 4.762829403606102, + "grad_norm": 0.42219273468760327, + "learning_rate": 6.838234511824748e-08, + "loss": 0.3339, + "step": 6868 + }, + { + "epoch": 4.763522884882108, + "grad_norm": 0.3830655678826751, + "learning_rate": 6.798394037883904e-08, + "loss": 0.3602, + "step": 6869 + }, + { + "epoch": 4.764216366158114, + "grad_norm": 0.558477613490607, + "learning_rate": 6.758669166827547e-08, + "loss": 0.3377, + "step": 6870 + }, + { + "epoch": 4.76490984743412, + "grad_norm": 0.4234565855302753, + "learning_rate": 6.719059907966952e-08, + "loss": 0.3641, + "step": 6871 + }, + { + "epoch": 4.7656033287101245, + "grad_norm": 0.4187831061714325, + "learning_rate": 6.679566270586191e-08, + "loss": 0.3299, + "step": 6872 + }, + { + "epoch": 4.76629680998613, + "grad_norm": 0.5000926901157373, + "learning_rate": 6.640188263942248e-08, + "loss": 0.3728, + "step": 6873 + }, + { + "epoch": 4.766990291262136, + "grad_norm": 0.4304122396360334, + "learning_rate": 6.600925897265187e-08, + "loss": 0.3734, + "step": 6874 + }, + { + "epoch": 4.767683772538142, + "grad_norm": 0.4123609497597858, + "learning_rate": 6.561779179757644e-08, + "loss": 0.3527, + "step": 6875 + }, + { + "epoch": 4.768377253814147, + "grad_norm": 0.395750825503072, + "learning_rate": 6.5227481205955e-08, + "loss": 0.3338, + "step": 6876 + }, + { + "epoch": 4.7690707350901524, + "grad_norm": 0.4360803428959288, + "learning_rate": 6.483832728927219e-08, + "loss": 0.3396, + "step": 6877 + }, + { + "epoch": 4.769764216366158, + "grad_norm": 0.384209773552081, + "learning_rate": 6.44503301387428e-08, + "loss": 0.3377, + "step": 6878 + }, + { + "epoch": 4.770457697642164, + "grad_norm": 0.3962186906657134, + "learning_rate": 6.406348984531241e-08, + "loss": 0.3549, + "step": 6879 + }, + { + "epoch": 4.771151178918169, + "grad_norm": 0.5821520902820478, + "learning_rate": 6.367780649965127e-08, + "loss": 0.3619, + "step": 6880 + }, + { + "epoch": 4.771844660194175, + "grad_norm": 0.48422951675088144, + "learning_rate": 6.329328019216208e-08, + "loss": 0.3458, + "step": 6881 + }, + { + "epoch": 4.77253814147018, + "grad_norm": 0.41427924570498564, + "learning_rate": 6.290991101297495e-08, + "loss": 0.3447, + "step": 6882 + }, + { + "epoch": 4.773231622746186, + "grad_norm": 0.5338804788176426, + "learning_rate": 6.2527699051948e-08, + "loss": 0.3732, + "step": 6883 + }, + { + "epoch": 4.773925104022191, + "grad_norm": 0.40049217356679934, + "learning_rate": 6.214664439866957e-08, + "loss": 0.3441, + "step": 6884 + }, + { + "epoch": 4.774618585298197, + "grad_norm": 0.40061440425441786, + "learning_rate": 6.176674714245656e-08, + "loss": 0.3554, + "step": 6885 + }, + { + "epoch": 4.775312066574203, + "grad_norm": 0.3919407783142851, + "learning_rate": 6.138800737235384e-08, + "loss": 0.3345, + "step": 6886 + }, + { + "epoch": 4.776005547850208, + "grad_norm": 0.4344771229222764, + "learning_rate": 6.101042517713429e-08, + "loss": 0.3489, + "step": 6887 + }, + { + "epoch": 4.776699029126213, + "grad_norm": 0.4195158216513353, + "learning_rate": 6.063400064530155e-08, + "loss": 0.3707, + "step": 6888 + }, + { + "epoch": 4.777392510402219, + "grad_norm": 0.39922307345510843, + "learning_rate": 6.025873386508673e-08, + "loss": 0.3612, + "step": 6889 + }, + { + "epoch": 4.778085991678225, + "grad_norm": 0.39642192644407176, + "learning_rate": 5.988462492444946e-08, + "loss": 0.3567, + "step": 6890 + }, + { + "epoch": 4.778779472954231, + "grad_norm": 0.40566625595319633, + "learning_rate": 5.9511673911077924e-08, + "loss": 0.3342, + "step": 6891 + }, + { + "epoch": 4.779472954230235, + "grad_norm": 0.4597075486751991, + "learning_rate": 5.913988091238943e-08, + "loss": 0.3297, + "step": 6892 + }, + { + "epoch": 4.780166435506241, + "grad_norm": 0.4340545199423955, + "learning_rate": 5.876924601552869e-08, + "loss": 0.3468, + "step": 6893 + }, + { + "epoch": 4.780859916782247, + "grad_norm": 0.3953802526566802, + "learning_rate": 5.839976930737179e-08, + "loss": 0.3512, + "step": 6894 + }, + { + "epoch": 4.781553398058253, + "grad_norm": 0.4267458150677247, + "learning_rate": 5.803145087451945e-08, + "loss": 0.3467, + "step": 6895 + }, + { + "epoch": 4.782246879334258, + "grad_norm": 0.4879897653854673, + "learning_rate": 5.766429080330371e-08, + "loss": 0.3269, + "step": 6896 + }, + { + "epoch": 4.782940360610263, + "grad_norm": 0.40101218093549146, + "learning_rate": 5.729828917978464e-08, + "loss": 0.3506, + "step": 6897 + }, + { + "epoch": 4.783633841886269, + "grad_norm": 0.40949889228304925, + "learning_rate": 5.693344608974916e-08, + "loss": 0.3548, + "step": 6898 + }, + { + "epoch": 4.784327323162275, + "grad_norm": 0.4324929112027942, + "learning_rate": 5.656976161871497e-08, + "loss": 0.3504, + "step": 6899 + }, + { + "epoch": 4.78502080443828, + "grad_norm": 0.41618782158161427, + "learning_rate": 5.620723585192667e-08, + "loss": 0.3907, + "step": 6900 + }, + { + "epoch": 4.785714285714286, + "grad_norm": 0.4244362889494322, + "learning_rate": 5.584586887435739e-08, + "loss": 0.3647, + "step": 6901 + }, + { + "epoch": 4.786407766990291, + "grad_norm": 0.40496961206327814, + "learning_rate": 5.5485660770709385e-08, + "loss": 0.3231, + "step": 6902 + }, + { + "epoch": 4.787101248266297, + "grad_norm": 0.6944059890736237, + "learning_rate": 5.512661162541233e-08, + "loss": 0.3278, + "step": 6903 + }, + { + "epoch": 4.787794729542302, + "grad_norm": 0.406457267015003, + "learning_rate": 5.476872152262558e-08, + "loss": 0.3295, + "step": 6904 + }, + { + "epoch": 4.788488210818308, + "grad_norm": 0.3981263206641865, + "learning_rate": 5.441199054623536e-08, + "loss": 0.3107, + "step": 6905 + }, + { + "epoch": 4.7891816920943135, + "grad_norm": 0.38689615703998426, + "learning_rate": 5.405641877985646e-08, + "loss": 0.3452, + "step": 6906 + }, + { + "epoch": 4.789875173370319, + "grad_norm": 0.4229570411223415, + "learning_rate": 5.370200630683331e-08, + "loss": 0.3209, + "step": 6907 + }, + { + "epoch": 4.790568654646324, + "grad_norm": 0.38626318053258785, + "learning_rate": 5.3348753210237244e-08, + "loss": 0.3518, + "step": 6908 + }, + { + "epoch": 4.79126213592233, + "grad_norm": 0.4802303893011554, + "learning_rate": 5.2996659572867595e-08, + "loss": 0.3545, + "step": 6909 + }, + { + "epoch": 4.791955617198336, + "grad_norm": 0.4015189840259108, + "learning_rate": 5.2645725477252775e-08, + "loss": 0.3443, + "step": 6910 + }, + { + "epoch": 4.7926490984743415, + "grad_norm": 0.45766271874627884, + "learning_rate": 5.229595100564977e-08, + "loss": 0.3515, + "step": 6911 + }, + { + "epoch": 4.793342579750346, + "grad_norm": 0.39156170175293953, + "learning_rate": 5.1947336240043e-08, + "loss": 0.3771, + "step": 6912 + }, + { + "epoch": 4.794036061026352, + "grad_norm": 0.3989951226746273, + "learning_rate": 5.159988126214543e-08, + "loss": 0.3236, + "step": 6913 + }, + { + "epoch": 4.794729542302358, + "grad_norm": 0.4008655698322317, + "learning_rate": 5.1253586153397485e-08, + "loss": 0.3666, + "step": 6914 + }, + { + "epoch": 4.795423023578364, + "grad_norm": 0.409899447239586, + "learning_rate": 5.090845099496866e-08, + "loss": 0.3416, + "step": 6915 + }, + { + "epoch": 4.796116504854369, + "grad_norm": 0.4623452915993466, + "learning_rate": 5.0564475867755924e-08, + "loss": 0.3604, + "step": 6916 + }, + { + "epoch": 4.796809986130374, + "grad_norm": 0.390243403261641, + "learning_rate": 5.0221660852384226e-08, + "loss": 0.3196, + "step": 6917 + }, + { + "epoch": 4.79750346740638, + "grad_norm": 0.410900232085877, + "learning_rate": 4.988000602920706e-08, + "loss": 0.3691, + "step": 6918 + }, + { + "epoch": 4.798196948682386, + "grad_norm": 0.4084424977197252, + "learning_rate": 4.953951147830649e-08, + "loss": 0.3458, + "step": 6919 + }, + { + "epoch": 4.798890429958391, + "grad_norm": 0.4265428979318083, + "learning_rate": 4.920017727949089e-08, + "loss": 0.327, + "step": 6920 + }, + { + "epoch": 4.7995839112343965, + "grad_norm": 0.42090461960905484, + "learning_rate": 4.886200351229886e-08, + "loss": 0.3831, + "step": 6921 + }, + { + "epoch": 4.800277392510402, + "grad_norm": 0.4345928866202194, + "learning_rate": 4.852499025599533e-08, + "loss": 0.3448, + "step": 6922 + }, + { + "epoch": 4.800970873786408, + "grad_norm": 0.44982911549267557, + "learning_rate": 4.818913758957378e-08, + "loss": 0.3627, + "step": 6923 + }, + { + "epoch": 4.801664355062413, + "grad_norm": 0.41447436912158064, + "learning_rate": 4.785444559175567e-08, + "loss": 0.3705, + "step": 6924 + }, + { + "epoch": 4.802357836338419, + "grad_norm": 0.42229609729339795, + "learning_rate": 4.752091434099049e-08, + "loss": 0.3869, + "step": 6925 + }, + { + "epoch": 4.8030513176144245, + "grad_norm": 0.4170527689602492, + "learning_rate": 4.718854391545569e-08, + "loss": 0.3318, + "step": 6926 + }, + { + "epoch": 4.80374479889043, + "grad_norm": 0.40702953087802257, + "learning_rate": 4.685733439305562e-08, + "loss": 0.3162, + "step": 6927 + }, + { + "epoch": 4.804438280166435, + "grad_norm": 0.37413577192538516, + "learning_rate": 4.6527285851424295e-08, + "loss": 0.3493, + "step": 6928 + }, + { + "epoch": 4.805131761442441, + "grad_norm": 0.5599324085959952, + "learning_rate": 4.619839836792261e-08, + "loss": 0.3631, + "step": 6929 + }, + { + "epoch": 4.805825242718447, + "grad_norm": 0.4062003685428794, + "learning_rate": 4.5870672019638905e-08, + "loss": 0.3622, + "step": 6930 + }, + { + "epoch": 4.8065187239944525, + "grad_norm": 0.37795181480569495, + "learning_rate": 4.5544106883390614e-08, + "loss": 0.3004, + "step": 6931 + }, + { + "epoch": 4.807212205270457, + "grad_norm": 0.3852526524395699, + "learning_rate": 4.5218703035721514e-08, + "loss": 0.3241, + "step": 6932 + }, + { + "epoch": 4.807905686546463, + "grad_norm": 0.45727984178567427, + "learning_rate": 4.489446055290392e-08, + "loss": 0.359, + "step": 6933 + }, + { + "epoch": 4.808599167822469, + "grad_norm": 0.40414797748884745, + "learning_rate": 4.4571379510938705e-08, + "loss": 0.4459, + "step": 6934 + }, + { + "epoch": 4.809292649098475, + "grad_norm": 0.4095538258795409, + "learning_rate": 4.424945998555308e-08, + "loss": 0.3159, + "step": 6935 + }, + { + "epoch": 4.8099861303744795, + "grad_norm": 0.45794444590898525, + "learning_rate": 4.3928702052202786e-08, + "loss": 0.349, + "step": 6936 + }, + { + "epoch": 4.810679611650485, + "grad_norm": 0.4382399261276273, + "learning_rate": 4.360910578607158e-08, + "loss": 0.3866, + "step": 6937 + }, + { + "epoch": 4.811373092926491, + "grad_norm": 0.43964004852582556, + "learning_rate": 4.329067126206954e-08, + "loss": 0.3981, + "step": 6938 + }, + { + "epoch": 4.812066574202497, + "grad_norm": 0.4359603259960818, + "learning_rate": 4.29733985548364e-08, + "loss": 0.3818, + "step": 6939 + }, + { + "epoch": 4.812760055478502, + "grad_norm": 0.44589836821992757, + "learning_rate": 4.265728773873767e-08, + "loss": 0.3683, + "step": 6940 + }, + { + "epoch": 4.8134535367545075, + "grad_norm": 0.42218529636975183, + "learning_rate": 4.234233888786799e-08, + "loss": 0.3132, + "step": 6941 + }, + { + "epoch": 4.814147018030513, + "grad_norm": 0.38511463630655796, + "learning_rate": 4.202855207604939e-08, + "loss": 0.3505, + "step": 6942 + }, + { + "epoch": 4.814840499306519, + "grad_norm": 0.4037907780637292, + "learning_rate": 4.171592737683083e-08, + "loss": 0.3534, + "step": 6943 + }, + { + "epoch": 4.815533980582524, + "grad_norm": 0.7813746787339475, + "learning_rate": 4.1404464863489256e-08, + "loss": 0.327, + "step": 6944 + }, + { + "epoch": 4.81622746185853, + "grad_norm": 0.40146937333052046, + "learning_rate": 4.109416460902904e-08, + "loss": 0.338, + "step": 6945 + }, + { + "epoch": 4.8169209431345354, + "grad_norm": 0.6195966408370592, + "learning_rate": 4.078502668618256e-08, + "loss": 0.3607, + "step": 6946 + }, + { + "epoch": 4.817614424410541, + "grad_norm": 0.43914167443459606, + "learning_rate": 4.0477051167410185e-08, + "loss": 0.3964, + "step": 6947 + }, + { + "epoch": 4.818307905686546, + "grad_norm": 0.4494699128783925, + "learning_rate": 4.017023812489751e-08, + "loss": 0.363, + "step": 6948 + }, + { + "epoch": 4.819001386962552, + "grad_norm": 0.39687876668104566, + "learning_rate": 3.986458763056089e-08, + "loss": 0.3419, + "step": 6949 + }, + { + "epoch": 4.819694868238558, + "grad_norm": 0.5078209870651843, + "learning_rate": 3.9560099756041915e-08, + "loss": 0.3593, + "step": 6950 + }, + { + "epoch": 4.820388349514563, + "grad_norm": 0.38621878624607414, + "learning_rate": 3.9256774572710157e-08, + "loss": 0.341, + "step": 6951 + }, + { + "epoch": 4.821081830790568, + "grad_norm": 0.7258465363588564, + "learning_rate": 3.8954612151663184e-08, + "loss": 0.3673, + "step": 6952 + }, + { + "epoch": 4.821775312066574, + "grad_norm": 0.4371026524517926, + "learning_rate": 3.8653612563725465e-08, + "loss": 0.3405, + "step": 6953 + }, + { + "epoch": 4.82246879334258, + "grad_norm": 0.49113893094781175, + "learning_rate": 3.835377587944944e-08, + "loss": 0.34, + "step": 6954 + }, + { + "epoch": 4.823162274618586, + "grad_norm": 0.4705036596618689, + "learning_rate": 3.80551021691139e-08, + "loss": 0.3671, + "step": 6955 + }, + { + "epoch": 4.8238557558945905, + "grad_norm": 0.4424351103421927, + "learning_rate": 3.775759150272673e-08, + "loss": 0.372, + "step": 6956 + }, + { + "epoch": 4.824549237170596, + "grad_norm": 0.4014604289703149, + "learning_rate": 3.74612439500216e-08, + "loss": 0.3226, + "step": 6957 + }, + { + "epoch": 4.825242718446602, + "grad_norm": 0.47077927144046877, + "learning_rate": 3.716605958046071e-08, + "loss": 0.3844, + "step": 6958 + }, + { + "epoch": 4.825936199722608, + "grad_norm": 0.3973943525985301, + "learning_rate": 3.687203846323262e-08, + "loss": 0.3468, + "step": 6959 + }, + { + "epoch": 4.826629680998613, + "grad_norm": 0.3913295607780083, + "learning_rate": 3.657918066725441e-08, + "loss": 0.3347, + "step": 6960 + }, + { + "epoch": 4.827323162274618, + "grad_norm": 0.38534171319172944, + "learning_rate": 3.6287486261169515e-08, + "loss": 0.3485, + "step": 6961 + }, + { + "epoch": 4.828016643550624, + "grad_norm": 0.5122972208847042, + "learning_rate": 3.599695531334879e-08, + "loss": 0.329, + "step": 6962 + }, + { + "epoch": 4.82871012482663, + "grad_norm": 0.4069284210103114, + "learning_rate": 3.570758789189055e-08, + "loss": 0.3655, + "step": 6963 + }, + { + "epoch": 4.829403606102635, + "grad_norm": 0.38485990839853634, + "learning_rate": 3.541938406462053e-08, + "loss": 0.3107, + "step": 6964 + }, + { + "epoch": 4.830097087378641, + "grad_norm": 0.42643408148264367, + "learning_rate": 3.513234389909192e-08, + "loss": 0.3255, + "step": 6965 + }, + { + "epoch": 4.830790568654646, + "grad_norm": 0.3914646908078493, + "learning_rate": 3.4846467462584796e-08, + "loss": 0.3308, + "step": 6966 + }, + { + "epoch": 4.831484049930652, + "grad_norm": 0.41991133893343485, + "learning_rate": 3.456175482210611e-08, + "loss": 0.3766, + "step": 6967 + }, + { + "epoch": 4.832177531206657, + "grad_norm": 0.4187980595385569, + "learning_rate": 3.4278206044390804e-08, + "loss": 0.3248, + "step": 6968 + }, + { + "epoch": 4.832871012482663, + "grad_norm": 0.4179467903424799, + "learning_rate": 3.399582119590072e-08, + "loss": 0.3271, + "step": 6969 + }, + { + "epoch": 4.833564493758669, + "grad_norm": 0.4452789265587225, + "learning_rate": 3.371460034282459e-08, + "loss": 0.351, + "step": 6970 + }, + { + "epoch": 4.834257975034674, + "grad_norm": 0.39743620351146214, + "learning_rate": 3.3434543551078555e-08, + "loss": 0.3335, + "step": 6971 + }, + { + "epoch": 4.834951456310679, + "grad_norm": 0.4425420257487699, + "learning_rate": 3.3155650886306236e-08, + "loss": 0.3394, + "step": 6972 + }, + { + "epoch": 4.835644937586685, + "grad_norm": 0.40050631569226564, + "learning_rate": 3.2877922413876994e-08, + "loss": 0.3335, + "step": 6973 + }, + { + "epoch": 4.836338418862691, + "grad_norm": 0.4264455119359866, + "learning_rate": 3.260135819888988e-08, + "loss": 0.3891, + "step": 6974 + }, + { + "epoch": 4.8370319001386965, + "grad_norm": 0.408461137740394, + "learning_rate": 3.232595830616858e-08, + "loss": 0.3499, + "step": 6975 + }, + { + "epoch": 4.837725381414701, + "grad_norm": 0.38137978335887013, + "learning_rate": 3.205172280026536e-08, + "loss": 0.3314, + "step": 6976 + }, + { + "epoch": 4.838418862690707, + "grad_norm": 0.38848118773462487, + "learning_rate": 3.1778651745458246e-08, + "loss": 0.3382, + "step": 6977 + }, + { + "epoch": 4.839112343966713, + "grad_norm": 0.3880703513525962, + "learning_rate": 3.1506745205753806e-08, + "loss": 0.3321, + "step": 6978 + }, + { + "epoch": 4.839805825242719, + "grad_norm": 0.42651294698344894, + "learning_rate": 3.123600324488496e-08, + "loss": 0.3189, + "step": 6979 + }, + { + "epoch": 4.840499306518724, + "grad_norm": 0.43988143678634084, + "learning_rate": 3.096642592631094e-08, + "loss": 0.349, + "step": 6980 + }, + { + "epoch": 4.841192787794729, + "grad_norm": 0.529721713784114, + "learning_rate": 3.069801331321953e-08, + "loss": 0.3297, + "step": 6981 + }, + { + "epoch": 4.841886269070735, + "grad_norm": 0.39465111595697205, + "learning_rate": 3.043076546852486e-08, + "loss": 0.3581, + "step": 6982 + }, + { + "epoch": 4.842579750346741, + "grad_norm": 0.4123035006107765, + "learning_rate": 3.0164682454866814e-08, + "loss": 0.3794, + "step": 6983 + }, + { + "epoch": 4.843273231622746, + "grad_norm": 0.7375004528984306, + "learning_rate": 2.989976433461439e-08, + "loss": 0.3477, + "step": 6984 + }, + { + "epoch": 4.843966712898752, + "grad_norm": 0.5067491560434093, + "learning_rate": 2.9636011169861812e-08, + "loss": 0.348, + "step": 6985 + }, + { + "epoch": 4.844660194174757, + "grad_norm": 0.4143473192391111, + "learning_rate": 2.9373423022431292e-08, + "loss": 0.3542, + "step": 6986 + }, + { + "epoch": 4.845353675450763, + "grad_norm": 0.6526855266680608, + "learning_rate": 2.9111999953871373e-08, + "loss": 0.3668, + "step": 6987 + }, + { + "epoch": 4.846047156726768, + "grad_norm": 0.4193674664460992, + "learning_rate": 2.885174202545804e-08, + "loss": 0.3619, + "step": 6988 + }, + { + "epoch": 4.846740638002774, + "grad_norm": 0.41134133442438553, + "learning_rate": 2.8592649298193053e-08, + "loss": 0.3185, + "step": 6989 + }, + { + "epoch": 4.8474341192787795, + "grad_norm": 0.40573112885511203, + "learning_rate": 2.8334721832807276e-08, + "loss": 0.3744, + "step": 6990 + }, + { + "epoch": 4.848127600554785, + "grad_norm": 0.40268972208204984, + "learning_rate": 2.8077959689755686e-08, + "loss": 0.3762, + "step": 6991 + }, + { + "epoch": 4.84882108183079, + "grad_norm": 0.39259809595310746, + "learning_rate": 2.7822362929221804e-08, + "loss": 0.3466, + "step": 6992 + }, + { + "epoch": 4.849514563106796, + "grad_norm": 0.4022798126883263, + "learning_rate": 2.7567931611116037e-08, + "loss": 0.3621, + "step": 6993 + }, + { + "epoch": 4.850208044382802, + "grad_norm": 0.39845692580513625, + "learning_rate": 2.7314665795075135e-08, + "loss": 0.395, + "step": 6994 + }, + { + "epoch": 4.8509015256588075, + "grad_norm": 0.3805788766910518, + "learning_rate": 2.7062565540462715e-08, + "loss": 0.326, + "step": 6995 + }, + { + "epoch": 4.851595006934812, + "grad_norm": 0.4235264999130844, + "learning_rate": 2.681163090636929e-08, + "loss": 0.3889, + "step": 6996 + }, + { + "epoch": 4.852288488210818, + "grad_norm": 0.3858796636544987, + "learning_rate": 2.65618619516117e-08, + "loss": 0.3616, + "step": 6997 + }, + { + "epoch": 4.852981969486824, + "grad_norm": 1.2476283436230053, + "learning_rate": 2.631325873473478e-08, + "loss": 0.3071, + "step": 6998 + }, + { + "epoch": 4.85367545076283, + "grad_norm": 0.38992115694265467, + "learning_rate": 2.6065821314009142e-08, + "loss": 0.3169, + "step": 6999 + }, + { + "epoch": 4.854368932038835, + "grad_norm": 0.6515897947207091, + "learning_rate": 2.581954974743117e-08, + "loss": 0.3497, + "step": 7000 + }, + { + "epoch": 4.85506241331484, + "grad_norm": 0.407057737738664, + "learning_rate": 2.5574444092726358e-08, + "loss": 0.3621, + "step": 7001 + }, + { + "epoch": 4.855755894590846, + "grad_norm": 0.47573275757093725, + "learning_rate": 2.5330504407345415e-08, + "loss": 0.3219, + "step": 7002 + }, + { + "epoch": 4.856449375866852, + "grad_norm": 0.3993589365632951, + "learning_rate": 2.508773074846649e-08, + "loss": 0.3433, + "step": 7003 + }, + { + "epoch": 4.857142857142857, + "grad_norm": 0.39260504467944574, + "learning_rate": 2.4846123172992953e-08, + "loss": 0.373, + "step": 7004 + }, + { + "epoch": 4.8578363384188625, + "grad_norm": 0.3861880075482633, + "learning_rate": 2.460568173755673e-08, + "loss": 0.3134, + "step": 7005 + }, + { + "epoch": 4.858529819694868, + "grad_norm": 0.4026053848786676, + "learning_rate": 2.436640649851496e-08, + "loss": 0.3872, + "step": 7006 + }, + { + "epoch": 4.859223300970874, + "grad_norm": 0.39099415234204743, + "learning_rate": 2.4128297511952227e-08, + "loss": 0.3499, + "step": 7007 + }, + { + "epoch": 4.859916782246879, + "grad_norm": 0.4116912765648552, + "learning_rate": 2.389135483367999e-08, + "loss": 0.3627, + "step": 7008 + }, + { + "epoch": 4.860610263522885, + "grad_norm": 0.442772142820646, + "learning_rate": 2.365557851923439e-08, + "loss": 0.3549, + "step": 7009 + }, + { + "epoch": 4.8613037447988905, + "grad_norm": 0.4048592213735465, + "learning_rate": 2.3420968623881768e-08, + "loss": 0.3153, + "step": 7010 + }, + { + "epoch": 4.861997226074896, + "grad_norm": 0.4128785983645319, + "learning_rate": 2.3187525202612028e-08, + "loss": 0.38, + "step": 7011 + }, + { + "epoch": 4.862690707350901, + "grad_norm": 0.4266549420850963, + "learning_rate": 2.295524831014251e-08, + "loss": 0.3895, + "step": 7012 + }, + { + "epoch": 4.863384188626907, + "grad_norm": 0.42120634597732115, + "learning_rate": 2.272413800091744e-08, + "loss": 0.3891, + "step": 7013 + }, + { + "epoch": 4.864077669902913, + "grad_norm": 0.40640859578001914, + "learning_rate": 2.249419432910682e-08, + "loss": 0.3423, + "step": 7014 + }, + { + "epoch": 4.8647711511789185, + "grad_norm": 0.3910487057475866, + "learning_rate": 2.2265417348608653e-08, + "loss": 0.3702, + "step": 7015 + }, + { + "epoch": 4.865464632454923, + "grad_norm": 0.4372197533274517, + "learning_rate": 2.203780711304615e-08, + "loss": 0.3635, + "step": 7016 + }, + { + "epoch": 4.866158113730929, + "grad_norm": 0.416883411335525, + "learning_rate": 2.1811363675769416e-08, + "loss": 0.3641, + "step": 7017 + }, + { + "epoch": 4.866851595006935, + "grad_norm": 0.3809042360514471, + "learning_rate": 2.1586087089855436e-08, + "loss": 0.3668, + "step": 7018 + }, + { + "epoch": 4.867545076282941, + "grad_norm": 0.4222391476538577, + "learning_rate": 2.136197740810697e-08, + "loss": 0.4006, + "step": 7019 + }, + { + "epoch": 4.8682385575589455, + "grad_norm": 0.3822094679396028, + "learning_rate": 2.1139034683054783e-08, + "loss": 0.3492, + "step": 7020 + }, + { + "epoch": 4.868932038834951, + "grad_norm": 1.3133856866351414, + "learning_rate": 2.0917258966953735e-08, + "loss": 0.3494, + "step": 7021 + }, + { + "epoch": 4.869625520110957, + "grad_norm": 0.4463626421740778, + "learning_rate": 2.069665031178669e-08, + "loss": 0.3427, + "step": 7022 + }, + { + "epoch": 4.870319001386963, + "grad_norm": 0.38051163270916527, + "learning_rate": 2.04772087692634e-08, + "loss": 0.3123, + "step": 7023 + }, + { + "epoch": 4.871012482662968, + "grad_norm": 0.410824829817691, + "learning_rate": 2.0258934390819386e-08, + "loss": 0.3313, + "step": 7024 + }, + { + "epoch": 4.8717059639389735, + "grad_norm": 0.39629206857971727, + "learning_rate": 2.0041827227615385e-08, + "loss": 0.3072, + "step": 7025 + }, + { + "epoch": 4.872399445214979, + "grad_norm": 0.4000981505989687, + "learning_rate": 1.9825887330540693e-08, + "loss": 0.3128, + "step": 7026 + }, + { + "epoch": 4.873092926490985, + "grad_norm": 0.7868279340490983, + "learning_rate": 1.9611114750209825e-08, + "loss": 0.3539, + "step": 7027 + }, + { + "epoch": 4.87378640776699, + "grad_norm": 0.3969314819095711, + "learning_rate": 1.9397509536964177e-08, + "loss": 0.3366, + "step": 7028 + }, + { + "epoch": 4.874479889042996, + "grad_norm": 0.3771405960249163, + "learning_rate": 1.9185071740871475e-08, + "loss": 0.3362, + "step": 7029 + }, + { + "epoch": 4.875173370319001, + "grad_norm": 0.3934923161098979, + "learning_rate": 1.8973801411724668e-08, + "loss": 0.3665, + "step": 7030 + }, + { + "epoch": 4.875866851595007, + "grad_norm": 0.5085863969277675, + "learning_rate": 1.87636985990447e-08, + "loss": 0.3547, + "step": 7031 + }, + { + "epoch": 4.876560332871012, + "grad_norm": 0.44542724936713324, + "learning_rate": 1.8554763352078288e-08, + "loss": 0.3308, + "step": 7032 + }, + { + "epoch": 4.877253814147018, + "grad_norm": 0.40118939164960027, + "learning_rate": 1.8346995719797366e-08, + "loss": 0.3711, + "step": 7033 + }, + { + "epoch": 4.877947295423024, + "grad_norm": 0.442827757482279, + "learning_rate": 1.814039575090243e-08, + "loss": 0.3765, + "step": 7034 + }, + { + "epoch": 4.878640776699029, + "grad_norm": 0.41765429340238064, + "learning_rate": 1.793496349381807e-08, + "loss": 0.3575, + "step": 7035 + }, + { + "epoch": 4.879334257975035, + "grad_norm": 0.44753901231903453, + "learning_rate": 1.773069899669633e-08, + "loss": 0.3795, + "step": 7036 + }, + { + "epoch": 4.88002773925104, + "grad_norm": 0.3958715474893592, + "learning_rate": 1.752760230741557e-08, + "loss": 0.3638, + "step": 7037 + }, + { + "epoch": 4.880721220527046, + "grad_norm": 0.3818979393266661, + "learning_rate": 1.7325673473580496e-08, + "loss": 0.3217, + "step": 7038 + }, + { + "epoch": 4.881414701803052, + "grad_norm": 0.4106820176143562, + "learning_rate": 1.7124912542520468e-08, + "loss": 0.3213, + "step": 7039 + }, + { + "epoch": 4.8821081830790565, + "grad_norm": 0.40224007247881505, + "learning_rate": 1.6925319561293953e-08, + "loss": 0.2979, + "step": 7040 + }, + { + "epoch": 4.882801664355062, + "grad_norm": 0.4812245227426676, + "learning_rate": 1.6726894576683527e-08, + "loss": 0.308, + "step": 7041 + }, + { + "epoch": 4.883495145631068, + "grad_norm": 0.4040953959083908, + "learning_rate": 1.6529637635198103e-08, + "loss": 0.3585, + "step": 7042 + }, + { + "epoch": 4.884188626907074, + "grad_norm": 0.4779342816171157, + "learning_rate": 1.633354878307347e-08, + "loss": 0.3156, + "step": 7043 + }, + { + "epoch": 4.8848821081830796, + "grad_norm": 0.41192796775293167, + "learning_rate": 1.6138628066271756e-08, + "loss": 0.3465, + "step": 7044 + }, + { + "epoch": 4.885575589459084, + "grad_norm": 0.42753611477598175, + "learning_rate": 1.594487553048085e-08, + "loss": 0.401, + "step": 7045 + }, + { + "epoch": 4.88626907073509, + "grad_norm": 0.39125203223697663, + "learning_rate": 1.575229122111499e-08, + "loss": 0.3145, + "step": 7046 + }, + { + "epoch": 4.886962552011096, + "grad_norm": 0.4320726198104153, + "learning_rate": 1.5560875183314172e-08, + "loss": 0.3577, + "step": 7047 + }, + { + "epoch": 4.887656033287101, + "grad_norm": 0.4639310463180436, + "learning_rate": 1.537062746194584e-08, + "loss": 0.3326, + "step": 7048 + }, + { + "epoch": 4.888349514563107, + "grad_norm": 0.4011769240495747, + "learning_rate": 1.518154810160155e-08, + "loss": 0.3695, + "step": 7049 + }, + { + "epoch": 4.889042995839112, + "grad_norm": 0.4083309067945322, + "learning_rate": 1.4993637146600848e-08, + "loss": 0.3475, + "step": 7050 + }, + { + "epoch": 4.889736477115118, + "grad_norm": 0.4385380049866903, + "learning_rate": 1.4806894640988501e-08, + "loss": 0.3924, + "step": 7051 + }, + { + "epoch": 4.890429958391124, + "grad_norm": 0.44989815131302324, + "learning_rate": 1.4621320628535051e-08, + "loss": 0.3389, + "step": 7052 + }, + { + "epoch": 4.891123439667129, + "grad_norm": 0.41299846546591695, + "learning_rate": 1.4436915152739039e-08, + "loss": 0.3327, + "step": 7053 + }, + { + "epoch": 4.891816920943135, + "grad_norm": 0.4180142352879065, + "learning_rate": 1.4253678256822e-08, + "loss": 0.3127, + "step": 7054 + }, + { + "epoch": 4.89251040221914, + "grad_norm": 0.4036307171221583, + "learning_rate": 1.4071609983735134e-08, + "loss": 0.324, + "step": 7055 + }, + { + "epoch": 4.893203883495145, + "grad_norm": 0.4269339794498483, + "learning_rate": 1.3890710376152638e-08, + "loss": 0.3415, + "step": 7056 + }, + { + "epoch": 4.893897364771151, + "grad_norm": 0.4126353564337846, + "learning_rate": 1.3710979476476705e-08, + "loss": 0.3294, + "step": 7057 + }, + { + "epoch": 4.894590846047157, + "grad_norm": 0.4174521233755747, + "learning_rate": 1.3532417326834746e-08, + "loss": 0.3769, + "step": 7058 + }, + { + "epoch": 4.8952843273231625, + "grad_norm": 0.4129989514917506, + "learning_rate": 1.3355023969080505e-08, + "loss": 0.3409, + "step": 7059 + }, + { + "epoch": 4.895977808599168, + "grad_norm": 0.44700227140726523, + "learning_rate": 1.3178799444794054e-08, + "loss": 0.36, + "step": 7060 + }, + { + "epoch": 4.896671289875173, + "grad_norm": 0.42706512180609374, + "learning_rate": 1.3003743795280133e-08, + "loss": 0.3839, + "step": 7061 + }, + { + "epoch": 4.897364771151179, + "grad_norm": 0.43007091027190886, + "learning_rate": 1.282985706157147e-08, + "loss": 0.371, + "step": 7062 + }, + { + "epoch": 4.898058252427185, + "grad_norm": 0.43856483768546134, + "learning_rate": 1.2657139284425468e-08, + "loss": 0.391, + "step": 7063 + }, + { + "epoch": 4.89875173370319, + "grad_norm": 0.4250172833116743, + "learning_rate": 1.248559050432585e-08, + "loss": 0.3798, + "step": 7064 + }, + { + "epoch": 4.899445214979195, + "grad_norm": 0.3825188785065298, + "learning_rate": 1.2315210761482676e-08, + "loss": 0.3687, + "step": 7065 + }, + { + "epoch": 4.900138696255201, + "grad_norm": 0.4270739073874196, + "learning_rate": 1.2146000095831777e-08, + "loss": 0.3225, + "step": 7066 + }, + { + "epoch": 4.900832177531207, + "grad_norm": 0.5045902051465194, + "learning_rate": 1.1977958547034207e-08, + "loss": 0.3759, + "step": 7067 + }, + { + "epoch": 4.901525658807213, + "grad_norm": 0.4327871423906362, + "learning_rate": 1.1811086154478458e-08, + "loss": 0.3883, + "step": 7068 + }, + { + "epoch": 4.902219140083218, + "grad_norm": 0.40798227694211514, + "learning_rate": 1.164538295727824e-08, + "loss": 0.3457, + "step": 7069 + }, + { + "epoch": 4.902912621359223, + "grad_norm": 0.4077172283402849, + "learning_rate": 1.1480848994272486e-08, + "loss": 0.3332, + "step": 7070 + }, + { + "epoch": 4.903606102635229, + "grad_norm": 0.4030325795048964, + "learning_rate": 1.131748430402757e-08, + "loss": 0.3578, + "step": 7071 + }, + { + "epoch": 4.904299583911234, + "grad_norm": 0.5523680722582051, + "learning_rate": 1.1155288924834529e-08, + "loss": 0.3861, + "step": 7072 + }, + { + "epoch": 4.90499306518724, + "grad_norm": 0.4189087197149499, + "learning_rate": 1.0994262894710728e-08, + "loss": 0.3381, + "step": 7073 + }, + { + "epoch": 4.9056865464632455, + "grad_norm": 0.5228980179247452, + "learning_rate": 1.083440625139931e-08, + "loss": 0.3184, + "step": 7074 + }, + { + "epoch": 4.906380027739251, + "grad_norm": 0.43055173519184403, + "learning_rate": 1.0675719032370303e-08, + "loss": 0.3496, + "step": 7075 + }, + { + "epoch": 4.907073509015257, + "grad_norm": 0.9588559473262878, + "learning_rate": 1.0518201274817841e-08, + "loss": 0.3406, + "step": 7076 + }, + { + "epoch": 4.907766990291262, + "grad_norm": 0.43695351069747884, + "learning_rate": 1.0361853015664058e-08, + "loss": 0.3519, + "step": 7077 + }, + { + "epoch": 4.908460471567268, + "grad_norm": 0.373248715528517, + "learning_rate": 1.0206674291555196e-08, + "loss": 0.3817, + "step": 7078 + }, + { + "epoch": 4.9091539528432735, + "grad_norm": 0.42625262911793366, + "learning_rate": 1.0052665138863827e-08, + "loss": 0.3848, + "step": 7079 + }, + { + "epoch": 4.909847434119278, + "grad_norm": 0.7145976646022153, + "learning_rate": 9.89982559368885e-09, + "loss": 0.3006, + "step": 7080 + }, + { + "epoch": 4.910540915395284, + "grad_norm": 0.41867967950863977, + "learning_rate": 9.748155691854943e-09, + "loss": 0.38, + "step": 7081 + }, + { + "epoch": 4.91123439667129, + "grad_norm": 0.44160569036967745, + "learning_rate": 9.59765546891256e-09, + "loss": 0.355, + "step": 7082 + }, + { + "epoch": 4.911927877947296, + "grad_norm": 0.4537941931538326, + "learning_rate": 9.448324960136812e-09, + "loss": 0.336, + "step": 7083 + }, + { + "epoch": 4.9126213592233015, + "grad_norm": 0.407616887547813, + "learning_rate": 9.300164200530815e-09, + "loss": 0.4169, + "step": 7084 + }, + { + "epoch": 4.913314840499306, + "grad_norm": 0.38994637144499206, + "learning_rate": 9.153173224821788e-09, + "loss": 0.3356, + "step": 7085 + }, + { + "epoch": 4.914008321775312, + "grad_norm": 0.4621612958587089, + "learning_rate": 9.007352067463837e-09, + "loss": 0.3511, + "step": 7086 + }, + { + "epoch": 4.914701803051318, + "grad_norm": 0.40015128452525367, + "learning_rate": 8.862700762635734e-09, + "loss": 0.3559, + "step": 7087 + }, + { + "epoch": 4.915395284327323, + "grad_norm": 0.42947360125160833, + "learning_rate": 8.71921934424369e-09, + "loss": 0.3678, + "step": 7088 + }, + { + "epoch": 4.9160887656033285, + "grad_norm": 0.39992195945735676, + "learning_rate": 8.57690784591747e-09, + "loss": 0.3707, + "step": 7089 + }, + { + "epoch": 4.916782246879334, + "grad_norm": 0.3998550897450325, + "learning_rate": 8.435766301014837e-09, + "loss": 0.3518, + "step": 7090 + }, + { + "epoch": 4.91747572815534, + "grad_norm": 0.39413823337501064, + "learning_rate": 8.295794742617658e-09, + "loss": 0.3211, + "step": 7091 + }, + { + "epoch": 4.918169209431346, + "grad_norm": 0.4563999153477434, + "learning_rate": 8.156993203534691e-09, + "loss": 0.3595, + "step": 7092 + }, + { + "epoch": 4.918862690707351, + "grad_norm": 0.3928458532096198, + "learning_rate": 8.019361716299912e-09, + "loss": 0.3937, + "step": 7093 + }, + { + "epoch": 4.9195561719833565, + "grad_norm": 0.4185983600252478, + "learning_rate": 7.88290031317307e-09, + "loss": 0.3549, + "step": 7094 + }, + { + "epoch": 4.920249653259362, + "grad_norm": 0.7822984738244929, + "learning_rate": 7.74760902613969e-09, + "loss": 0.3522, + "step": 7095 + }, + { + "epoch": 4.920943134535367, + "grad_norm": 0.4280494713069808, + "learning_rate": 7.61348788691163e-09, + "loss": 0.3868, + "step": 7096 + }, + { + "epoch": 4.921636615811373, + "grad_norm": 0.5805247856423185, + "learning_rate": 7.480536926925408e-09, + "loss": 0.3219, + "step": 7097 + }, + { + "epoch": 4.922330097087379, + "grad_norm": 0.39361981063657586, + "learning_rate": 7.348756177343319e-09, + "loss": 0.3418, + "step": 7098 + }, + { + "epoch": 4.9230235783633844, + "grad_norm": 0.4323439155185724, + "learning_rate": 7.218145669054544e-09, + "loss": 0.365, + "step": 7099 + }, + { + "epoch": 4.92371705963939, + "grad_norm": 0.6507807845998212, + "learning_rate": 7.088705432672926e-09, + "loss": 0.3992, + "step": 7100 + }, + { + "epoch": 4.924410540915395, + "grad_norm": 0.4162201792567533, + "learning_rate": 6.960435498538642e-09, + "loss": 0.3523, + "step": 7101 + }, + { + "epoch": 4.925104022191401, + "grad_norm": 0.3931946134907611, + "learning_rate": 6.833335896716531e-09, + "loss": 0.4205, + "step": 7102 + }, + { + "epoch": 4.925797503467407, + "grad_norm": 0.42435835286730755, + "learning_rate": 6.707406656998872e-09, + "loss": 0.3542, + "step": 7103 + }, + { + "epoch": 4.9264909847434115, + "grad_norm": 0.38737572198406295, + "learning_rate": 6.5826478089014985e-09, + "loss": 0.3453, + "step": 7104 + }, + { + "epoch": 4.927184466019417, + "grad_norm": 0.4107577928920802, + "learning_rate": 6.4590593816676875e-09, + "loss": 0.3147, + "step": 7105 + }, + { + "epoch": 4.927877947295423, + "grad_norm": 0.403061784628308, + "learning_rate": 6.336641404265376e-09, + "loss": 0.3768, + "step": 7106 + }, + { + "epoch": 4.928571428571429, + "grad_norm": 0.40829423505655965, + "learning_rate": 6.215393905388278e-09, + "loss": 0.3678, + "step": 7107 + }, + { + "epoch": 4.929264909847435, + "grad_norm": 0.4053521142682166, + "learning_rate": 6.09531691345644e-09, + "loss": 0.3432, + "step": 7108 + }, + { + "epoch": 4.9299583911234395, + "grad_norm": 0.3785363134026289, + "learning_rate": 5.976410456614567e-09, + "loss": 0.2789, + "step": 7109 + }, + { + "epoch": 4.930651872399445, + "grad_norm": 0.40055105473222985, + "learning_rate": 5.858674562733701e-09, + "loss": 0.3634, + "step": 7110 + }, + { + "epoch": 4.931345353675451, + "grad_norm": 0.3899632262172668, + "learning_rate": 5.7421092594101004e-09, + "loss": 0.3175, + "step": 7111 + }, + { + "epoch": 4.932038834951456, + "grad_norm": 0.4210211754862228, + "learning_rate": 5.626714573966352e-09, + "loss": 0.3144, + "step": 7112 + }, + { + "epoch": 4.932732316227462, + "grad_norm": 0.40473876869257785, + "learning_rate": 5.51249053344971e-09, + "loss": 0.353, + "step": 7113 + }, + { + "epoch": 4.933425797503467, + "grad_norm": 0.4070079827340145, + "learning_rate": 5.3994371646332035e-09, + "loss": 0.3554, + "step": 7114 + }, + { + "epoch": 4.934119278779473, + "grad_norm": 0.40054824740715883, + "learning_rate": 5.28755449401619e-09, + "loss": 0.3547, + "step": 7115 + }, + { + "epoch": 4.934812760055479, + "grad_norm": 0.3716953117983449, + "learning_rate": 5.176842547823246e-09, + "loss": 0.3348, + "step": 7116 + }, + { + "epoch": 4.935506241331484, + "grad_norm": 0.4842082602057677, + "learning_rate": 5.067301352004173e-09, + "loss": 0.3615, + "step": 7117 + }, + { + "epoch": 4.93619972260749, + "grad_norm": 0.3988125373189556, + "learning_rate": 4.9589309322339855e-09, + "loss": 0.334, + "step": 7118 + }, + { + "epoch": 4.936893203883495, + "grad_norm": 0.4429792862866007, + "learning_rate": 4.851731313915142e-09, + "loss": 0.3452, + "step": 7119 + }, + { + "epoch": 4.9375866851595, + "grad_norm": 0.3837224278088282, + "learning_rate": 4.745702522174211e-09, + "loss": 0.329, + "step": 7120 + }, + { + "epoch": 4.938280166435506, + "grad_norm": 0.47356687930404123, + "learning_rate": 4.64084458186298e-09, + "loss": 0.3284, + "step": 7121 + }, + { + "epoch": 4.938973647711512, + "grad_norm": 0.3972972665988645, + "learning_rate": 4.537157517559565e-09, + "loss": 0.3516, + "step": 7122 + }, + { + "epoch": 4.939667128987518, + "grad_norm": 0.41403779851119105, + "learning_rate": 4.434641353567859e-09, + "loss": 0.376, + "step": 7123 + }, + { + "epoch": 4.940360610263523, + "grad_norm": 0.3930310031152471, + "learning_rate": 4.333296113916419e-09, + "loss": 0.3105, + "step": 7124 + }, + { + "epoch": 4.941054091539528, + "grad_norm": 0.4433083584155557, + "learning_rate": 4.233121822359576e-09, + "loss": 0.3449, + "step": 7125 + }, + { + "epoch": 4.941747572815534, + "grad_norm": 0.3775190964602308, + "learning_rate": 4.134118502378548e-09, + "loss": 0.3548, + "step": 7126 + }, + { + "epoch": 4.94244105409154, + "grad_norm": 0.532387712211218, + "learning_rate": 4.036286177178661e-09, + "loss": 0.354, + "step": 7127 + }, + { + "epoch": 4.943134535367545, + "grad_norm": 0.4585703117502366, + "learning_rate": 3.939624869689907e-09, + "loss": 0.3615, + "step": 7128 + }, + { + "epoch": 4.94382801664355, + "grad_norm": 1.480065738645681, + "learning_rate": 3.844134602570826e-09, + "loss": 0.3546, + "step": 7129 + }, + { + "epoch": 4.944521497919556, + "grad_norm": 0.43030452013479187, + "learning_rate": 3.749815398202405e-09, + "loss": 0.3168, + "step": 7130 + }, + { + "epoch": 4.945214979195562, + "grad_norm": 0.4260708572746793, + "learning_rate": 3.656667278692516e-09, + "loss": 0.3428, + "step": 7131 + }, + { + "epoch": 4.945908460471568, + "grad_norm": 0.4638901817332141, + "learning_rate": 3.5646902658748037e-09, + "loss": 0.3501, + "step": 7132 + }, + { + "epoch": 4.946601941747573, + "grad_norm": 0.4583420023247845, + "learning_rate": 3.4738843813075795e-09, + "loss": 0.3743, + "step": 7133 + }, + { + "epoch": 4.947295423023578, + "grad_norm": 0.3987034843061641, + "learning_rate": 3.3842496462754837e-09, + "loss": 0.3417, + "step": 7134 + }, + { + "epoch": 4.947988904299584, + "grad_norm": 0.40547612685671913, + "learning_rate": 3.295786081788377e-09, + "loss": 0.3861, + "step": 7135 + }, + { + "epoch": 4.948682385575589, + "grad_norm": 0.4363235490099635, + "learning_rate": 3.2084937085807844e-09, + "loss": 0.3693, + "step": 7136 + }, + { + "epoch": 4.949375866851595, + "grad_norm": 0.4457057527680636, + "learning_rate": 3.1223725471135613e-09, + "loss": 0.3442, + "step": 7137 + }, + { + "epoch": 4.950069348127601, + "grad_norm": 0.37605432501494873, + "learning_rate": 3.037422617573893e-09, + "loss": 0.3539, + "step": 7138 + }, + { + "epoch": 4.950762829403606, + "grad_norm": 0.4138186809228635, + "learning_rate": 2.953643939871964e-09, + "loss": 0.3365, + "step": 7139 + }, + { + "epoch": 4.951456310679612, + "grad_norm": 0.37527648900238364, + "learning_rate": 2.8710365336459546e-09, + "loss": 0.3537, + "step": 7140 + }, + { + "epoch": 4.952149791955617, + "grad_norm": 0.40656410976225216, + "learning_rate": 2.789600418258154e-09, + "loss": 0.3487, + "step": 7141 + }, + { + "epoch": 4.952843273231623, + "grad_norm": 0.5160649085711888, + "learning_rate": 2.7093356127960712e-09, + "loss": 0.3644, + "step": 7142 + }, + { + "epoch": 4.9535367545076285, + "grad_norm": 0.42775218125201714, + "learning_rate": 2.6302421360741014e-09, + "loss": 0.3152, + "step": 7143 + }, + { + "epoch": 4.954230235783633, + "grad_norm": 0.4125229433260301, + "learning_rate": 2.5523200066301935e-09, + "loss": 0.3299, + "step": 7144 + }, + { + "epoch": 4.954923717059639, + "grad_norm": 0.418855765509575, + "learning_rate": 2.475569242729736e-09, + "loss": 0.4071, + "step": 7145 + }, + { + "epoch": 4.955617198335645, + "grad_norm": 0.43067685886835083, + "learning_rate": 2.3999898623622288e-09, + "loss": 0.385, + "step": 7146 + }, + { + "epoch": 4.956310679611651, + "grad_norm": 0.40011527404403346, + "learning_rate": 2.3255818832423894e-09, + "loss": 0.3303, + "step": 7147 + }, + { + "epoch": 4.9570041608876565, + "grad_norm": 0.42203301734339205, + "learning_rate": 2.252345322811267e-09, + "loss": 0.3437, + "step": 7148 + }, + { + "epoch": 4.957697642163661, + "grad_norm": 0.3737632311503237, + "learning_rate": 2.1802801982351294e-09, + "loss": 0.3006, + "step": 7149 + }, + { + "epoch": 4.958391123439667, + "grad_norm": 0.394844019404785, + "learning_rate": 2.109386526405466e-09, + "loss": 0.3175, + "step": 7150 + }, + { + "epoch": 4.959084604715673, + "grad_norm": 0.4109833593874536, + "learning_rate": 2.0396643239389834e-09, + "loss": 0.3691, + "step": 7151 + }, + { + "epoch": 4.959778085991678, + "grad_norm": 0.41160675767056626, + "learning_rate": 1.9711136071787206e-09, + "loss": 0.3755, + "step": 7152 + }, + { + "epoch": 4.960471567267684, + "grad_norm": 0.3713147211785371, + "learning_rate": 1.90373439219127e-09, + "loss": 0.3131, + "step": 7153 + }, + { + "epoch": 4.961165048543689, + "grad_norm": 0.675322868087702, + "learning_rate": 1.8375266947712188e-09, + "loss": 0.3772, + "step": 7154 + }, + { + "epoch": 4.961858529819695, + "grad_norm": 0.44718262080735793, + "learning_rate": 1.772490530436155e-09, + "loss": 0.3534, + "step": 7155 + }, + { + "epoch": 4.962552011095701, + "grad_norm": 0.38804943209509857, + "learning_rate": 1.7086259144305507e-09, + "loss": 0.3518, + "step": 7156 + }, + { + "epoch": 4.963245492371706, + "grad_norm": 0.43518653897743365, + "learning_rate": 1.6459328617240978e-09, + "loss": 0.378, + "step": 7157 + }, + { + "epoch": 4.9639389736477115, + "grad_norm": 0.4692201853941706, + "learning_rate": 1.5844113870105981e-09, + "loss": 0.3548, + "step": 7158 + }, + { + "epoch": 4.964632454923717, + "grad_norm": 0.38375468242062444, + "learning_rate": 1.524061504711294e-09, + "loss": 0.3442, + "step": 7159 + }, + { + "epoch": 4.965325936199722, + "grad_norm": 0.4532502337646082, + "learning_rate": 1.4648832289709812e-09, + "loss": 0.3792, + "step": 7160 + }, + { + "epoch": 4.966019417475728, + "grad_norm": 0.3943191174588462, + "learning_rate": 1.406876573660787e-09, + "loss": 0.3334, + "step": 7161 + }, + { + "epoch": 4.966712898751734, + "grad_norm": 0.45066506419530183, + "learning_rate": 1.3500415523776123e-09, + "loss": 0.3885, + "step": 7162 + }, + { + "epoch": 4.9674063800277395, + "grad_norm": 0.4421344145332074, + "learning_rate": 1.2943781784424681e-09, + "loss": 0.3752, + "step": 7163 + }, + { + "epoch": 4.968099861303745, + "grad_norm": 0.3944046743074427, + "learning_rate": 1.2398864649032505e-09, + "loss": 0.363, + "step": 7164 + }, + { + "epoch": 4.96879334257975, + "grad_norm": 0.4313886380543036, + "learning_rate": 1.1865664245314101e-09, + "loss": 0.3576, + "step": 7165 + }, + { + "epoch": 4.969486823855756, + "grad_norm": 0.819869336042323, + "learning_rate": 1.1344180698258377e-09, + "loss": 0.3821, + "step": 7166 + }, + { + "epoch": 4.970180305131762, + "grad_norm": 0.45643068110033763, + "learning_rate": 1.0834414130084236e-09, + "loss": 0.3579, + "step": 7167 + }, + { + "epoch": 4.970873786407767, + "grad_norm": 2.8618563914122612, + "learning_rate": 1.0336364660290532e-09, + "loss": 0.3536, + "step": 7168 + }, + { + "epoch": 4.971567267683772, + "grad_norm": 0.386465653944153, + "learning_rate": 9.850032405611665e-10, + "loss": 0.3451, + "step": 7169 + }, + { + "epoch": 4.972260748959778, + "grad_norm": 0.3893868756867282, + "learning_rate": 9.375417480034232e-10, + "loss": 0.351, + "step": 7170 + }, + { + "epoch": 4.972954230235784, + "grad_norm": 0.4078707652612696, + "learning_rate": 8.912519994813684e-10, + "loss": 0.3475, + "step": 7171 + }, + { + "epoch": 4.97364771151179, + "grad_norm": 0.40150829492419765, + "learning_rate": 8.461340058446566e-10, + "loss": 0.3925, + "step": 7172 + }, + { + "epoch": 4.9743411927877945, + "grad_norm": 0.39618053064184006, + "learning_rate": 8.02187777668162e-10, + "loss": 0.3024, + "step": 7173 + }, + { + "epoch": 4.9750346740638, + "grad_norm": 0.4162817270365035, + "learning_rate": 7.594133252530888e-10, + "loss": 0.3629, + "step": 7174 + }, + { + "epoch": 4.975728155339806, + "grad_norm": 0.4112483806849657, + "learning_rate": 7.178106586258615e-10, + "loss": 0.3581, + "step": 7175 + }, + { + "epoch": 4.976421636615811, + "grad_norm": 0.4093664270423176, + "learning_rate": 6.773797875364585e-10, + "loss": 0.4025, + "step": 7176 + }, + { + "epoch": 4.977115117891817, + "grad_norm": 0.6649449873759345, + "learning_rate": 6.381207214628538e-10, + "loss": 0.403, + "step": 7177 + }, + { + "epoch": 4.9778085991678225, + "grad_norm": 0.3949098028085531, + "learning_rate": 6.000334696071309e-10, + "loss": 0.3397, + "step": 7178 + }, + { + "epoch": 4.978502080443828, + "grad_norm": 0.40806486102133166, + "learning_rate": 5.631180408954829e-10, + "loss": 0.3588, + "step": 7179 + }, + { + "epoch": 4.979195561719834, + "grad_norm": 0.4325669155327499, + "learning_rate": 5.273744439815431e-10, + "loss": 0.346, + "step": 7180 + }, + { + "epoch": 4.979889042995839, + "grad_norm": 0.4185228147340133, + "learning_rate": 4.928026872436098e-10, + "loss": 0.3812, + "step": 7181 + }, + { + "epoch": 4.980582524271845, + "grad_norm": 0.4213328422111207, + "learning_rate": 4.5940277878409047e-10, + "loss": 0.3798, + "step": 7182 + }, + { + "epoch": 4.98127600554785, + "grad_norm": 0.482819301252033, + "learning_rate": 4.2717472643227785e-10, + "loss": 0.3791, + "step": 7183 + }, + { + "epoch": 4.981969486823855, + "grad_norm": 0.404326783039016, + "learning_rate": 3.961185377421295e-10, + "loss": 0.3556, + "step": 7184 + }, + { + "epoch": 4.982662968099861, + "grad_norm": 0.41607815316935426, + "learning_rate": 3.662342199933777e-10, + "loss": 0.3772, + "step": 7185 + }, + { + "epoch": 4.983356449375867, + "grad_norm": 0.3772116345648475, + "learning_rate": 3.375217801898645e-10, + "loss": 0.2853, + "step": 7186 + }, + { + "epoch": 4.984049930651873, + "grad_norm": 0.4027984181463469, + "learning_rate": 3.099812250617618e-10, + "loss": 0.3264, + "step": 7187 + }, + { + "epoch": 4.984743411927878, + "grad_norm": 0.4693428818263529, + "learning_rate": 2.8361256106501644e-10, + "loss": 0.378, + "step": 7188 + }, + { + "epoch": 4.985436893203883, + "grad_norm": 0.4115243332211786, + "learning_rate": 2.58415794379685e-10, + "loss": 0.3393, + "step": 7189 + }, + { + "epoch": 4.986130374479889, + "grad_norm": 0.41852247502798956, + "learning_rate": 2.343909309115988e-10, + "loss": 0.3628, + "step": 7190 + }, + { + "epoch": 4.986823855755895, + "grad_norm": 0.4213731088509524, + "learning_rate": 2.1153797629291928e-10, + "loss": 0.3309, + "step": 7191 + }, + { + "epoch": 4.9875173370319, + "grad_norm": 0.39253205449439826, + "learning_rate": 1.8985693587880715e-10, + "loss": 0.3565, + "step": 7192 + }, + { + "epoch": 4.9882108183079055, + "grad_norm": 0.4284526466840562, + "learning_rate": 1.6934781475241856e-10, + "loss": 0.3823, + "step": 7193 + }, + { + "epoch": 4.988904299583911, + "grad_norm": 0.41975107589138805, + "learning_rate": 1.500106177204641e-10, + "loss": 0.3774, + "step": 7194 + }, + { + "epoch": 4.989597780859917, + "grad_norm": 0.37972385363613254, + "learning_rate": 1.3184534931487414e-10, + "loss": 0.3468, + "step": 7195 + }, + { + "epoch": 4.990291262135923, + "grad_norm": 0.4092809448296632, + "learning_rate": 1.148520137944642e-10, + "loss": 0.3242, + "step": 7196 + }, + { + "epoch": 4.990984743411928, + "grad_norm": 0.4228224967768626, + "learning_rate": 9.903061514160428e-11, + "loss": 0.3583, + "step": 7197 + }, + { + "epoch": 4.991678224687933, + "grad_norm": 0.40379280414258806, + "learning_rate": 8.43811570655495e-11, + "loss": 0.3662, + "step": 7198 + }, + { + "epoch": 4.992371705963939, + "grad_norm": 0.4440129635914101, + "learning_rate": 7.090364299910946e-11, + "loss": 0.3607, + "step": 7199 + }, + { + "epoch": 4.993065187239944, + "grad_norm": 0.3868024627311072, + "learning_rate": 5.859807610142377e-11, + "loss": 0.3622, + "step": 7200 + }, + { + "epoch": 4.99375866851595, + "grad_norm": 0.43580068144591233, + "learning_rate": 4.746445925740695e-11, + "loss": 0.3606, + "step": 7201 + }, + { + "epoch": 4.994452149791956, + "grad_norm": 0.43794296060757415, + "learning_rate": 3.750279507608312e-11, + "loss": 0.3923, + "step": 7202 + }, + { + "epoch": 4.995145631067961, + "grad_norm": 0.4056518962016014, + "learning_rate": 2.8713085892806415e-11, + "loss": 0.4026, + "step": 7203 + }, + { + "epoch": 4.995839112343967, + "grad_norm": 0.36567630084588015, + "learning_rate": 2.109533376759565e-11, + "loss": 0.3209, + "step": 7204 + }, + { + "epoch": 4.996532593619972, + "grad_norm": 0.462277374759319, + "learning_rate": 1.4649540486244564e-11, + "loss": 0.3666, + "step": 7205 + }, + { + "epoch": 4.997226074895978, + "grad_norm": 0.4190622269402481, + "learning_rate": 9.375707559211578e-12, + "loss": 0.3559, + "step": 7206 + }, + { + "epoch": 4.997919556171984, + "grad_norm": 0.4448874348254667, + "learning_rate": 5.273836223285145e-12, + "loss": 0.3286, + "step": 7207 + }, + { + "epoch": 4.9986130374479885, + "grad_norm": 0.4704150656196769, + "learning_rate": 2.3439274393632916e-12, + "loss": 0.398, + "step": 7208 + }, + { + "epoch": 4.999306518723994, + "grad_norm": 0.45786451727663974, + "learning_rate": 5.859818941189588e-13, + "loss": 0.3508, + "step": 7209 + }, + { + "epoch": 5.0, + "grad_norm": 0.39151771220459436, + "learning_rate": 0.0, + "loss": 0.3327, + "step": 7210 + }, + { + "epoch": 5.0, + "step": 7210, + "total_flos": 708732517220352.0, + "train_loss": 0.4619506940348965, + "train_runtime": 68592.6905, + "train_samples_per_second": 1.681, + "train_steps_per_second": 0.105 + } + ], + "logging_steps": 1, + "max_steps": 7210, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 10, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 708732517220352.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}