{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.96398891966759,
  "eval_steps": 500,
  "global_step": 180,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01662049861495845,
      "grad_norm": 1.9652302265167236,
      "learning_rate": 0.0,
      "loss": 0.7152,
      "step": 1
    },
    {
      "epoch": 0.0332409972299169,
      "grad_norm": 2.135629177093506,
      "learning_rate": 5.555555555555555e-07,
      "loss": 0.7024,
      "step": 2
    },
    {
      "epoch": 0.04986149584487535,
      "grad_norm": 2.365844964981079,
      "learning_rate": 1.111111111111111e-06,
      "loss": 0.7755,
      "step": 3
    },
    {
      "epoch": 0.0664819944598338,
      "grad_norm": 1.939900517463684,
      "learning_rate": 1.6666666666666667e-06,
      "loss": 0.7134,
      "step": 4
    },
    {
      "epoch": 0.08310249307479224,
      "grad_norm": 1.8507870435714722,
      "learning_rate": 2.222222222222222e-06,
      "loss": 0.6644,
      "step": 5
    },
    {
      "epoch": 0.0997229916897507,
      "grad_norm": 1.8390847444534302,
      "learning_rate": 2.7777777777777783e-06,
      "loss": 0.7306,
      "step": 6
    },
    {
      "epoch": 0.11634349030470914,
      "grad_norm": 1.2149966955184937,
      "learning_rate": 3.3333333333333333e-06,
      "loss": 0.5377,
      "step": 7
    },
    {
      "epoch": 0.1329639889196676,
      "grad_norm": 1.203329086303711,
      "learning_rate": 3.88888888888889e-06,
      "loss": 0.6448,
      "step": 8
    },
    {
      "epoch": 0.14958448753462603,
      "grad_norm": 1.1259090900421143,
      "learning_rate": 4.444444444444444e-06,
      "loss": 0.6041,
      "step": 9
    },
    {
      "epoch": 0.16620498614958448,
      "grad_norm": 0.9785488247871399,
      "learning_rate": 5e-06,
      "loss": 0.6802,
      "step": 10
    },
    {
      "epoch": 0.18282548476454294,
      "grad_norm": 0.7702904343605042,
      "learning_rate": 5.555555555555557e-06,
      "loss": 0.5737,
      "step": 11
    },
    {
      "epoch": 0.1994459833795014,
      "grad_norm": 0.7972448468208313,
      "learning_rate": 6.111111111111112e-06,
      "loss": 0.6071,
      "step": 12
    },
    {
      "epoch": 0.21606648199445982,
      "grad_norm": 0.8643639087677002,
      "learning_rate": 6.666666666666667e-06,
      "loss": 0.5645,
      "step": 13
    },
    {
      "epoch": 0.23268698060941828,
      "grad_norm": 0.822340190410614,
      "learning_rate": 7.222222222222223e-06,
      "loss": 0.5512,
      "step": 14
    },
    {
      "epoch": 0.24930747922437674,
      "grad_norm": 1.0604660511016846,
      "learning_rate": 7.77777777777778e-06,
      "loss": 0.5875,
      "step": 15
    },
    {
      "epoch": 0.2659279778393352,
      "grad_norm": 0.8126739263534546,
      "learning_rate": 8.333333333333334e-06,
      "loss": 0.5601,
      "step": 16
    },
    {
      "epoch": 0.28254847645429365,
      "grad_norm": 0.7240079641342163,
      "learning_rate": 8.888888888888888e-06,
      "loss": 0.5724,
      "step": 17
    },
    {
      "epoch": 0.29916897506925205,
      "grad_norm": 0.6566236615180969,
      "learning_rate": 9.444444444444445e-06,
      "loss": 0.5535,
      "step": 18
    },
    {
      "epoch": 0.3157894736842105,
      "grad_norm": 0.7229272723197937,
      "learning_rate": 1e-05,
      "loss": 0.5413,
      "step": 19
    },
    {
      "epoch": 0.33240997229916897,
      "grad_norm": 0.6160261034965515,
      "learning_rate": 9.999059852242508e-06,
      "loss": 0.4809,
      "step": 20
    },
    {
      "epoch": 0.3490304709141274,
      "grad_norm": 0.5426657199859619,
      "learning_rate": 9.996239762521152e-06,
      "loss": 0.4453,
      "step": 21
    },
    {
      "epoch": 0.3656509695290859,
      "grad_norm": 0.6986624002456665,
      "learning_rate": 9.991540791356342e-06,
      "loss": 0.5704,
      "step": 22
    },
    {
      "epoch": 0.38227146814404434,
      "grad_norm": 0.6466948986053467,
      "learning_rate": 9.98496470583896e-06,
      "loss": 0.5222,
      "step": 23
    },
    {
      "epoch": 0.3988919667590028,
      "grad_norm": 0.5881003141403198,
      "learning_rate": 9.976513978965829e-06,
      "loss": 0.4903,
      "step": 24
    },
    {
      "epoch": 0.4155124653739612,
      "grad_norm": 0.5835773348808289,
      "learning_rate": 9.966191788709716e-06,
      "loss": 0.4936,
      "step": 25
    },
    {
      "epoch": 0.43213296398891965,
      "grad_norm": 0.5974717736244202,
      "learning_rate": 9.954002016824226e-06,
      "loss": 0.544,
      "step": 26
    },
    {
      "epoch": 0.4487534626038781,
      "grad_norm": 0.6126233339309692,
      "learning_rate": 9.939949247384046e-06,
      "loss": 0.5313,
      "step": 27
    },
    {
      "epoch": 0.46537396121883656,
      "grad_norm": 0.5605891942977905,
      "learning_rate": 9.924038765061042e-06,
      "loss": 0.5121,
      "step": 28
    },
    {
      "epoch": 0.481994459833795,
      "grad_norm": 0.523395299911499,
      "learning_rate": 9.906276553136924e-06,
      "loss": 0.4705,
      "step": 29
    },
    {
      "epoch": 0.4986149584487535,
      "grad_norm": 0.5597982406616211,
      "learning_rate": 9.886669291253178e-06,
      "loss": 0.4951,
      "step": 30
    },
    {
      "epoch": 0.5152354570637119,
      "grad_norm": 0.5273374915122986,
      "learning_rate": 9.86522435289912e-06,
      "loss": 0.4763,
      "step": 31
    },
    {
      "epoch": 0.5318559556786704,
      "grad_norm": 0.5255304574966431,
      "learning_rate": 9.841949802639031e-06,
      "loss": 0.5133,
      "step": 32
    },
    {
      "epoch": 0.5484764542936288,
      "grad_norm": 0.8223831057548523,
      "learning_rate": 9.816854393079402e-06,
      "loss": 0.4865,
      "step": 33
    },
    {
      "epoch": 0.5650969529085873,
      "grad_norm": 0.4619203805923462,
      "learning_rate": 9.789947561577445e-06,
      "loss": 0.4631,
      "step": 34
    },
    {
      "epoch": 0.5817174515235457,
      "grad_norm": 0.4974648654460907,
      "learning_rate": 9.761239426692077e-06,
      "loss": 0.5039,
      "step": 35
    },
    {
      "epoch": 0.5983379501385041,
      "grad_norm": 0.5178198218345642,
      "learning_rate": 9.730740784378755e-06,
      "loss": 0.4618,
      "step": 36
    },
    {
      "epoch": 0.6149584487534626,
      "grad_norm": 0.5592218637466431,
      "learning_rate": 9.698463103929542e-06,
      "loss": 0.4777,
      "step": 37
    },
    {
      "epoch": 0.631578947368421,
      "grad_norm": 0.4956098198890686,
      "learning_rate": 9.664418523660004e-06,
      "loss": 0.4925,
      "step": 38
    },
    {
      "epoch": 0.6481994459833795,
      "grad_norm": 0.48805150389671326,
      "learning_rate": 9.628619846344453e-06,
      "loss": 0.4423,
      "step": 39
    },
    {
      "epoch": 0.6648199445983379,
      "grad_norm": 0.5749639868736267,
      "learning_rate": 9.591080534401371e-06,
      "loss": 0.55,
      "step": 40
    },
    {
      "epoch": 0.6814404432132964,
      "grad_norm": 0.7393980622291565,
      "learning_rate": 9.551814704830734e-06,
      "loss": 0.426,
      "step": 41
    },
    {
      "epoch": 0.6980609418282548,
      "grad_norm": 0.5011327862739563,
      "learning_rate": 9.51083712390519e-06,
      "loss": 0.4628,
      "step": 42
    },
    {
      "epoch": 0.7146814404432132,
      "grad_norm": 0.572926938533783,
      "learning_rate": 9.468163201617063e-06,
      "loss": 0.527,
      "step": 43
    },
    {
      "epoch": 0.7313019390581718,
      "grad_norm": 0.5243227481842041,
      "learning_rate": 9.423808985883289e-06,
      "loss": 0.5115,
      "step": 44
    },
    {
      "epoch": 0.7479224376731302,
      "grad_norm": 0.5271593928337097,
      "learning_rate": 9.377791156510456e-06,
      "loss": 0.4921,
      "step": 45
    },
    {
      "epoch": 0.7645429362880887,
      "grad_norm": 0.5143831968307495,
      "learning_rate": 9.330127018922195e-06,
      "loss": 0.4842,
      "step": 46
    },
    {
      "epoch": 0.7811634349030471,
      "grad_norm": 0.5135733485221863,
      "learning_rate": 9.280834497651334e-06,
      "loss": 0.4939,
      "step": 47
    },
    {
      "epoch": 0.7977839335180056,
      "grad_norm": 0.5173041820526123,
      "learning_rate": 9.229932129599206e-06,
      "loss": 0.4819,
      "step": 48
    },
    {
      "epoch": 0.814404432132964,
      "grad_norm": 0.570851743221283,
      "learning_rate": 9.177439057064684e-06,
      "loss": 0.5439,
      "step": 49
    },
    {
      "epoch": 0.8310249307479224,
      "grad_norm": 0.552671492099762,
      "learning_rate": 9.123375020545534e-06,
      "loss": 0.4669,
      "step": 50
    },
    {
      "epoch": 0.8476454293628809,
      "grad_norm": 0.5668032765388489,
      "learning_rate": 9.067760351314838e-06,
      "loss": 0.5138,
      "step": 51
    },
    {
      "epoch": 0.8642659279778393,
      "grad_norm": 0.48532989621162415,
      "learning_rate": 9.01061596377522e-06,
      "loss": 0.4827,
      "step": 52
    },
    {
      "epoch": 0.8808864265927978,
      "grad_norm": 0.4953126311302185,
      "learning_rate": 8.951963347593797e-06,
      "loss": 0.4273,
      "step": 53
    },
    {
      "epoch": 0.8975069252077562,
      "grad_norm": 0.5042351484298706,
      "learning_rate": 8.891824559620801e-06,
      "loss": 0.5311,
      "step": 54
    },
    {
      "epoch": 0.9141274238227147,
      "grad_norm": 0.532244086265564,
      "learning_rate": 8.83022221559489e-06,
      "loss": 0.5364,
      "step": 55
    },
    {
      "epoch": 0.9307479224376731,
      "grad_norm": 0.5507211089134216,
      "learning_rate": 8.767179481638303e-06,
      "loss": 0.5264,
      "step": 56
    },
    {
      "epoch": 0.9473684210526315,
      "grad_norm": 0.5117627382278442,
      "learning_rate": 8.702720065545024e-06,
      "loss": 0.4994,
      "step": 57
    },
    {
      "epoch": 0.96398891966759,
      "grad_norm": 0.6424684524536133,
      "learning_rate": 8.636868207865244e-06,
      "loss": 0.5321,
      "step": 58
    },
    {
      "epoch": 0.9806094182825484,
      "grad_norm": 0.5632804036140442,
      "learning_rate": 8.569648672789496e-06,
      "loss": 0.5354,
      "step": 59
    },
    {
      "epoch": 0.997229916897507,
      "grad_norm": 0.5519580841064453,
      "learning_rate": 8.501086738835843e-06,
      "loss": 0.5502,
      "step": 60
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.5519580841064453,
      "learning_rate": 8.43120818934367e-06,
      "loss": 0.4298,
      "step": 61
    },
    {
      "epoch": 1.0166204986149585,
      "grad_norm": 1.4024403095245361,
      "learning_rate": 8.360039302777614e-06,
      "loss": 0.3848,
      "step": 62
    },
    {
      "epoch": 1.0332409972299168,
      "grad_norm": 0.4745033085346222,
      "learning_rate": 8.28760684284532e-06,
      "loss": 0.4,
      "step": 63
    },
    {
      "epoch": 1.0498614958448753,
      "grad_norm": 0.5079669952392578,
      "learning_rate": 8.213938048432697e-06,
      "loss": 0.3824,
      "step": 64
    },
    {
      "epoch": 1.0664819944598338,
      "grad_norm": 0.49697190523147583,
      "learning_rate": 8.139060623360494e-06,
      "loss": 0.4243,
      "step": 65
    },
    {
      "epoch": 1.0831024930747923,
      "grad_norm": 0.4616394639015198,
      "learning_rate": 8.063002725966014e-06,
      "loss": 0.3888,
      "step": 66
    },
    {
      "epoch": 1.0997229916897506,
      "grad_norm": 0.4260391294956207,
      "learning_rate": 7.985792958513932e-06,
      "loss": 0.3406,
      "step": 67
    },
    {
      "epoch": 1.1163434903047091,
      "grad_norm": 0.47153493762016296,
      "learning_rate": 7.907460356440133e-06,
      "loss": 0.3636,
      "step": 68
    },
    {
      "epoch": 1.1329639889196677,
      "grad_norm": 0.5076174139976501,
      "learning_rate": 7.828034377432694e-06,
      "loss": 0.4166,
      "step": 69
    },
    {
      "epoch": 1.149584487534626,
      "grad_norm": 0.5310080647468567,
      "learning_rate": 7.747544890354031e-06,
      "loss": 0.4311,
      "step": 70
    },
    {
      "epoch": 1.1662049861495845,
      "grad_norm": 0.5010002851486206,
      "learning_rate": 7.666022164008458e-06,
      "loss": 0.3193,
      "step": 71
    },
    {
      "epoch": 1.182825484764543,
      "grad_norm": 0.49259936809539795,
      "learning_rate": 7.5834968557593155e-06,
      "loss": 0.3456,
      "step": 72
    },
    {
      "epoch": 1.1994459833795015,
      "grad_norm": 0.5213885307312012,
      "learning_rate": 7.500000000000001e-06,
      "loss": 0.3615,
      "step": 73
    },
    {
      "epoch": 1.2160664819944598,
      "grad_norm": 0.512752115726471,
      "learning_rate": 7.415562996483193e-06,
      "loss": 0.3569,
      "step": 74
    },
    {
      "epoch": 1.2326869806094183,
      "grad_norm": 0.5139035582542419,
      "learning_rate": 7.330217598512696e-06,
      "loss": 0.3859,
      "step": 75
    },
    {
      "epoch": 1.2493074792243768,
      "grad_norm": 0.5561084151268005,
      "learning_rate": 7.243995901002312e-06,
      "loss": 0.363,
      "step": 76
    },
    {
      "epoch": 1.2659279778393353,
      "grad_norm": 0.49844229221343994,
      "learning_rate": 7.156930328406268e-06,
      "loss": 0.3648,
      "step": 77
    },
    {
      "epoch": 1.2825484764542936,
      "grad_norm": 0.5111745595932007,
      "learning_rate": 7.069053622525697e-06,
      "loss": 0.3453,
      "step": 78
    },
    {
      "epoch": 1.299168975069252,
      "grad_norm": 0.5968831777572632,
      "learning_rate": 6.980398830195785e-06,
      "loss": 0.3601,
      "step": 79
    },
    {
      "epoch": 1.3157894736842106,
      "grad_norm": 0.3998188376426697,
      "learning_rate": 6.890999290858213e-06,
      "loss": 0.2965,
      "step": 80
    },
    {
      "epoch": 1.332409972299169,
      "grad_norm": 0.5044348239898682,
      "learning_rate": 6.800888624023552e-06,
      "loss": 0.3579,
      "step": 81
    },
    {
      "epoch": 1.3490304709141274,
      "grad_norm": 0.499636709690094,
      "learning_rate": 6.710100716628345e-06,
      "loss": 0.3751,
      "step": 82
    },
    {
      "epoch": 1.365650969529086,
      "grad_norm": 0.5045871734619141,
      "learning_rate": 6.618669710291607e-06,
      "loss": 0.3782,
      "step": 83
    },
    {
      "epoch": 1.3822714681440442,
      "grad_norm": 0.5296726822853088,
      "learning_rate": 6.526629988475567e-06,
      "loss": 0.413,
      "step": 84
    },
    {
      "epoch": 1.3988919667590027,
      "grad_norm": 0.5541542768478394,
      "learning_rate": 6.434016163555452e-06,
      "loss": 0.4176,
      "step": 85
    },
    {
      "epoch": 1.4155124653739612,
      "grad_norm": 0.52264803647995,
      "learning_rate": 6.340863063803187e-06,
      "loss": 0.3687,
      "step": 86
    },
    {
      "epoch": 1.4321329639889195,
      "grad_norm": 0.5726013779640198,
      "learning_rate": 6.247205720289907e-06,
      "loss": 0.4127,
      "step": 87
    },
    {
      "epoch": 1.448753462603878,
      "grad_norm": 0.5129911303520203,
      "learning_rate": 6.153079353712201e-06,
      "loss": 0.3608,
      "step": 88
    },
    {
      "epoch": 1.4653739612188366,
      "grad_norm": 0.5869404673576355,
      "learning_rate": 6.058519361147055e-06,
      "loss": 0.369,
      "step": 89
    },
    {
      "epoch": 1.481994459833795,
      "grad_norm": 0.4603992998600006,
      "learning_rate": 5.9635613027404495e-06,
      "loss": 0.2792,
      "step": 90
    },
    {
      "epoch": 1.4986149584487536,
      "grad_norm": 0.433829128742218,
      "learning_rate": 5.8682408883346535e-06,
      "loss": 0.2935,
      "step": 91
    },
    {
      "epoch": 1.5152354570637119,
      "grad_norm": 0.4892548620700836,
      "learning_rate": 5.772593964039203e-06,
      "loss": 0.3591,
      "step": 92
    },
    {
      "epoch": 1.5318559556786704,
      "grad_norm": 0.4414325952529907,
      "learning_rate": 5.6766564987506564e-06,
      "loss": 0.3312,
      "step": 93
    },
    {
      "epoch": 1.548476454293629,
      "grad_norm": 0.5104185938835144,
      "learning_rate": 5.5804645706261515e-06,
      "loss": 0.3524,
      "step": 94
    },
    {
      "epoch": 1.5650969529085872,
      "grad_norm": 0.46491438150405884,
      "learning_rate": 5.484054353515896e-06,
      "loss": 0.3127,
      "step": 95
    },
    {
      "epoch": 1.5817174515235457,
      "grad_norm": 0.5037529468536377,
      "learning_rate": 5.387462103359655e-06,
      "loss": 0.3549,
      "step": 96
    },
    {
      "epoch": 1.5983379501385042,
      "grad_norm": 0.456927090883255,
      "learning_rate": 5.290724144552379e-06,
      "loss": 0.3583,
      "step": 97
    },
    {
      "epoch": 1.6149584487534625,
      "grad_norm": 0.48146891593933105,
      "learning_rate": 5.193876856284085e-06,
      "loss": 0.3485,
      "step": 98
    },
    {
      "epoch": 1.631578947368421,
      "grad_norm": 0.45695117115974426,
      "learning_rate": 5.096956658859122e-06,
      "loss": 0.3325,
      "step": 99
    },
    {
      "epoch": 1.6481994459833795,
      "grad_norm": 0.46289077401161194,
      "learning_rate": 5e-06,
      "loss": 0.3461,
      "step": 100
    },
    {
      "epoch": 1.6648199445983378,
      "grad_norm": 0.5340746641159058,
      "learning_rate": 4.903043341140879e-06,
      "loss": 0.3856,
      "step": 101
    },
    {
      "epoch": 1.6814404432132966,
      "grad_norm": 0.433956503868103,
      "learning_rate": 4.806123143715916e-06,
      "loss": 0.3166,
      "step": 102
    },
    {
      "epoch": 1.6980609418282548,
      "grad_norm": 0.4446304440498352,
      "learning_rate": 4.7092758554476215e-06,
      "loss": 0.3378,
      "step": 103
    },
    {
      "epoch": 1.7146814404432131,
      "grad_norm": 0.5027093291282654,
      "learning_rate": 4.6125378966403465e-06,
      "loss": 0.3915,
      "step": 104
    },
    {
      "epoch": 1.7313019390581719,
      "grad_norm": 0.5546647310256958,
      "learning_rate": 4.515945646484105e-06,
      "loss": 0.3484,
      "step": 105
    },
    {
      "epoch": 1.7479224376731302,
      "grad_norm": 0.49674123525619507,
      "learning_rate": 4.4195354293738484e-06,
      "loss": 0.3501,
      "step": 106
    },
    {
      "epoch": 1.7645429362880887,
      "grad_norm": 0.5134773850440979,
      "learning_rate": 4.323343501249346e-06,
      "loss": 0.3818,
      "step": 107
    },
    {
      "epoch": 1.7811634349030472,
      "grad_norm": 0.5111790299415588,
      "learning_rate": 4.227406035960798e-06,
      "loss": 0.4027,
      "step": 108
    },
    {
      "epoch": 1.7977839335180055,
      "grad_norm": 0.5103554129600525,
      "learning_rate": 4.131759111665349e-06,
      "loss": 0.3295,
      "step": 109
    },
    {
      "epoch": 1.814404432132964,
      "grad_norm": 0.48488280177116394,
      "learning_rate": 4.036438697259551e-06,
      "loss": 0.3339,
      "step": 110
    },
    {
      "epoch": 1.8310249307479225,
      "grad_norm": 0.4840296506881714,
      "learning_rate": 3.941480638852948e-06,
      "loss": 0.3519,
      "step": 111
    },
    {
      "epoch": 1.8476454293628808,
      "grad_norm": 0.4919949471950531,
      "learning_rate": 3.8469206462878e-06,
      "loss": 0.328,
      "step": 112
    },
    {
      "epoch": 1.8642659279778393,
      "grad_norm": 0.5291365385055542,
      "learning_rate": 3.752794279710094e-06,
      "loss": 0.3753,
      "step": 113
    },
    {
      "epoch": 1.8808864265927978,
      "grad_norm": 0.4807715117931366,
      "learning_rate": 3.6591369361968127e-06,
      "loss": 0.393,
      "step": 114
    },
    {
      "epoch": 1.897506925207756,
      "grad_norm": 0.4700012803077698,
      "learning_rate": 3.5659838364445505e-06,
      "loss": 0.3182,
      "step": 115
    },
    {
      "epoch": 1.9141274238227148,
      "grad_norm": 1.0692706108093262,
      "learning_rate": 3.473370011524435e-06,
      "loss": 0.3463,
      "step": 116
    },
    {
      "epoch": 1.9307479224376731,
      "grad_norm": 0.49183958768844604,
      "learning_rate": 3.3813302897083955e-06,
      "loss": 0.3694,
      "step": 117
    },
    {
      "epoch": 1.9473684210526314,
      "grad_norm": 0.5577133893966675,
      "learning_rate": 3.289899283371657e-06,
      "loss": 0.3693,
      "step": 118
    },
    {
      "epoch": 1.9639889196675901,
      "grad_norm": 0.47118237614631653,
      "learning_rate": 3.1991113759764493e-06,
      "loss": 0.3325,
      "step": 119
    },
    {
      "epoch": 1.9806094182825484,
      "grad_norm": 0.44954901933670044,
      "learning_rate": 3.1090007091417884e-06,
      "loss": 0.3497,
      "step": 120
    },
    {
      "epoch": 1.997229916897507,
      "grad_norm": 0.5316449403762817,
      "learning_rate": 3.019601169804216e-06,
      "loss": 0.4239,
      "step": 121
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.5316449403762817,
      "learning_rate": 2.9309463774743047e-06,
      "loss": 0.302,
      "step": 122
    },
    {
      "epoch": 2.0166204986149583,
      "grad_norm": 1.3086326122283936,
      "learning_rate": 2.843069671593734e-06,
      "loss": 0.2255,
      "step": 123
    },
    {
      "epoch": 2.033240997229917,
      "grad_norm": 0.4746488928794861,
      "learning_rate": 2.7560040989976894e-06,
      "loss": 0.2275,
      "step": 124
    },
    {
      "epoch": 2.0498614958448753,
      "grad_norm": 0.4944143295288086,
      "learning_rate": 2.6697824014873076e-06,
      "loss": 0.2648,
      "step": 125
    },
    {
      "epoch": 2.0664819944598336,
      "grad_norm": 0.5195774435997009,
      "learning_rate": 2.5844370035168077e-06,
      "loss": 0.2707,
      "step": 126
    },
    {
      "epoch": 2.0831024930747923,
      "grad_norm": 0.885553240776062,
      "learning_rate": 2.5000000000000015e-06,
      "loss": 0.2764,
      "step": 127
    },
    {
      "epoch": 2.0997229916897506,
      "grad_norm": 0.5028234124183655,
      "learning_rate": 2.4165031442406857e-06,
      "loss": 0.2503,
      "step": 128
    },
    {
      "epoch": 2.1163434903047094,
      "grad_norm": 0.4780957102775574,
      "learning_rate": 2.333977835991545e-06,
      "loss": 0.2406,
      "step": 129
    },
    {
      "epoch": 2.1329639889196677,
      "grad_norm": 0.46052825450897217,
      "learning_rate": 2.2524551096459703e-06,
      "loss": 0.2155,
      "step": 130
    },
    {
      "epoch": 2.149584487534626,
      "grad_norm": 0.6180452704429626,
      "learning_rate": 2.171965622567308e-06,
      "loss": 0.2787,
      "step": 131
    },
    {
      "epoch": 2.1662049861495847,
      "grad_norm": 0.6939100027084351,
      "learning_rate": 2.0925396435598665e-06,
      "loss": 0.246,
      "step": 132
    },
    {
      "epoch": 2.182825484764543,
      "grad_norm": 0.6042692065238953,
      "learning_rate": 2.0142070414860704e-06,
      "loss": 0.2609,
      "step": 133
    },
    {
      "epoch": 2.1994459833795013,
      "grad_norm": 0.7851183414459229,
      "learning_rate": 1.936997274033986e-06,
      "loss": 0.2876,
      "step": 134
    },
    {
      "epoch": 2.21606648199446,
      "grad_norm": 0.5801565051078796,
      "learning_rate": 1.8609393766395083e-06,
      "loss": 0.288,
      "step": 135
    },
    {
      "epoch": 2.2326869806094183,
      "grad_norm": 0.5398533940315247,
      "learning_rate": 1.7860619515673034e-06,
      "loss": 0.2958,
      "step": 136
    },
    {
      "epoch": 2.2493074792243766,
      "grad_norm": 0.48142921924591064,
      "learning_rate": 1.7123931571546826e-06,
      "loss": 0.2506,
      "step": 137
    },
    {
      "epoch": 2.2659279778393353,
      "grad_norm": 0.48484477400779724,
      "learning_rate": 1.639960697222388e-06,
      "loss": 0.2166,
      "step": 138
    },
    {
      "epoch": 2.2825484764542936,
      "grad_norm": 0.4676513075828552,
      "learning_rate": 1.5687918106563326e-06,
      "loss": 0.2558,
      "step": 139
    },
    {
      "epoch": 2.299168975069252,
      "grad_norm": 0.5008206963539124,
      "learning_rate": 1.4989132611641576e-06,
      "loss": 0.2315,
      "step": 140
    },
    {
      "epoch": 2.3157894736842106,
      "grad_norm": 0.5055615901947021,
      "learning_rate": 1.4303513272105057e-06,
      "loss": 0.278,
      "step": 141
    },
    {
      "epoch": 2.332409972299169,
      "grad_norm": 0.5048314332962036,
      "learning_rate": 1.3631317921347564e-06,
      "loss": 0.2469,
      "step": 142
    },
    {
      "epoch": 2.349030470914127,
      "grad_norm": 0.4561052620410919,
      "learning_rate": 1.297279934454978e-06,
      "loss": 0.2363,
      "step": 143
    },
    {
      "epoch": 2.365650969529086,
      "grad_norm": 0.4409971237182617,
      "learning_rate": 1.2328205183616964e-06,
      "loss": 0.2582,
      "step": 144
    },
    {
      "epoch": 2.3822714681440442,
      "grad_norm": 0.5186073780059814,
      "learning_rate": 1.1697777844051105e-06,
      "loss": 0.2354,
      "step": 145
    },
    {
      "epoch": 2.398891966759003,
      "grad_norm": 0.4931983947753906,
      "learning_rate": 1.1081754403792e-06,
      "loss": 0.2628,
      "step": 146
    },
    {
      "epoch": 2.4155124653739612,
      "grad_norm": 0.4725812077522278,
      "learning_rate": 1.0480366524062041e-06,
      "loss": 0.2465,
      "step": 147
    },
    {
      "epoch": 2.4321329639889195,
      "grad_norm": 0.459830641746521,
      "learning_rate": 9.893840362247809e-07,
      "loss": 0.2494,
      "step": 148
    },
    {
      "epoch": 2.4487534626038783,
      "grad_norm": 0.45882484316825867,
      "learning_rate": 9.322396486851626e-07,
      "loss": 0.2572,
      "step": 149
    },
    {
      "epoch": 2.4653739612188366,
      "grad_norm": 0.4628044664859772,
      "learning_rate": 8.766249794544662e-07,
      "loss": 0.2473,
      "step": 150
    },
    {
      "epoch": 2.481994459833795,
      "grad_norm": 0.43482884764671326,
      "learning_rate": 8.225609429353187e-07,
      "loss": 0.2334,
      "step": 151
    },
    {
      "epoch": 2.4986149584487536,
      "grad_norm": 0.5092786550521851,
      "learning_rate": 7.700678704007947e-07,
      "loss": 0.2464,
      "step": 152
    },
    {
      "epoch": 2.515235457063712,
      "grad_norm": 0.5002970695495605,
      "learning_rate": 7.191655023486682e-07,
      "loss": 0.2386,
      "step": 153
    },
    {
      "epoch": 2.5318559556786706,
      "grad_norm": 0.44085896015167236,
      "learning_rate": 6.698729810778065e-07,
      "loss": 0.2231,
      "step": 154
    },
    {
      "epoch": 2.548476454293629,
      "grad_norm": 0.4750898480415344,
      "learning_rate": 6.222088434895462e-07,
      "loss": 0.2746,
      "step": 155
    },
    {
      "epoch": 2.565096952908587,
      "grad_norm": 0.5058760643005371,
      "learning_rate": 5.76191014116711e-07,
      "loss": 0.2753,
      "step": 156
    },
    {
      "epoch": 2.581717451523546,
      "grad_norm": 0.4807314872741699,
      "learning_rate": 5.318367983829393e-07,
      "loss": 0.2295,
      "step": 157
    },
    {
      "epoch": 2.598337950138504,
      "grad_norm": 0.4975450336933136,
      "learning_rate": 4.891628760948114e-07,
      "loss": 0.2623,
      "step": 158
    },
    {
      "epoch": 2.6149584487534625,
      "grad_norm": 0.44517505168914795,
      "learning_rate": 4.481852951692672e-07,
      "loss": 0.2505,
      "step": 159
    },
    {
      "epoch": 2.6315789473684212,
      "grad_norm": 0.526871919631958,
      "learning_rate": 4.089194655986306e-07,
      "loss": 0.2944,
      "step": 160
    },
    {
      "epoch": 2.6481994459833795,
      "grad_norm": 0.5860976576805115,
      "learning_rate": 3.7138015365554834e-07,
      "loss": 0.2929,
      "step": 161
    },
    {
      "epoch": 2.664819944598338,
      "grad_norm": 0.5570012927055359,
      "learning_rate": 3.355814763399973e-07,
      "loss": 0.2669,
      "step": 162
    },
    {
      "epoch": 2.6814404432132966,
      "grad_norm": 0.46305856108665466,
      "learning_rate": 3.015368960704584e-07,
      "loss": 0.2464,
      "step": 163
    },
    {
      "epoch": 2.698060941828255,
      "grad_norm": 0.49931517243385315,
      "learning_rate": 2.6925921562124867e-07,
      "loss": 0.233,
      "step": 164
    },
    {
      "epoch": 2.714681440443213,
      "grad_norm": 0.4253719449043274,
      "learning_rate": 2.3876057330792344e-07,
      "loss": 0.2115,
      "step": 165
    },
    {
      "epoch": 2.731301939058172,
      "grad_norm": 0.46956562995910645,
      "learning_rate": 2.1005243842255552e-07,
      "loss": 0.2419,
      "step": 166
    },
    {
      "epoch": 2.74792243767313,
      "grad_norm": 0.47405821084976196,
      "learning_rate": 1.8314560692059836e-07,
      "loss": 0.2442,
      "step": 167
    },
    {
      "epoch": 2.7645429362880884,
      "grad_norm": 0.5373594164848328,
      "learning_rate": 1.5805019736097105e-07,
      "loss": 0.304,
      "step": 168
    },
    {
      "epoch": 2.781163434903047,
      "grad_norm": 0.49911409616470337,
      "learning_rate": 1.3477564710088097e-07,
      "loss": 0.2604,
      "step": 169
    },
    {
      "epoch": 2.7977839335180055,
      "grad_norm": 0.524211585521698,
      "learning_rate": 1.1333070874682217e-07,
      "loss": 0.2319,
      "step": 170
    },
    {
      "epoch": 2.8144044321329638,
      "grad_norm": 0.49799832701683044,
      "learning_rate": 9.372344686307655e-08,
      "loss": 0.2648,
      "step": 171
    },
    {
      "epoch": 2.8310249307479225,
      "grad_norm": 0.4979800581932068,
      "learning_rate": 7.59612349389599e-08,
      "loss": 0.2671,
      "step": 172
    },
    {
      "epoch": 2.847645429362881,
      "grad_norm": 0.5030661225318909,
      "learning_rate": 6.005075261595495e-08,
      "loss": 0.2219,
      "step": 173
    },
    {
      "epoch": 2.864265927977839,
      "grad_norm": 0.4839530885219574,
      "learning_rate": 4.599798317577342e-08,
      "loss": 0.2981,
      "step": 174
    },
    {
      "epoch": 2.880886426592798,
      "grad_norm": 0.49113729596138,
      "learning_rate": 3.3808211290284886e-08,
      "loss": 0.2574,
      "step": 175
    },
    {
      "epoch": 2.897506925207756,
      "grad_norm": 0.5154249668121338,
      "learning_rate": 2.3486021034170857e-08,
      "loss": 0.2584,
      "step": 176
    },
    {
      "epoch": 2.914127423822715,
      "grad_norm": 0.46952885389328003,
      "learning_rate": 1.5035294161039882e-08,
      "loss": 0.2785,
      "step": 177
    },
    {
      "epoch": 2.930747922437673,
      "grad_norm": 0.49860695004463196,
      "learning_rate": 8.459208643659122e-09,
      "loss": 0.2572,
      "step": 178
    },
    {
      "epoch": 2.9473684210526314,
      "grad_norm": 0.5341483354568481,
      "learning_rate": 3.760237478849793e-09,
      "loss": 0.2964,
      "step": 179
    },
    {
      "epoch": 2.96398891966759,
      "grad_norm": 0.5575993061065674,
      "learning_rate": 9.401477574932927e-10,
      "loss": 0.2896,
      "step": 180
    },
    {
      "epoch": 2.96398891966759,
      "step": 180,
      "total_flos": 6.743893969836442e+16,
      "train_loss": 0.3866574793226189,
      "train_runtime": 24143.75,
      "train_samples_per_second": 0.179,
      "train_steps_per_second": 0.007
    }
  ],
  "logging_steps": 1,
  "max_steps": 180,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 6.743893969836442e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}