{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 12.269938650306749,
  "eval_steps": 10000000,
  "global_step": 1000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.12269938650306748,
      "grad_norm": 30.28219704365505,
      "learning_rate": 2.4691358024691355e-08,
      "loss": 2.9799,
      "step": 10
    },
    {
      "epoch": 0.24539877300613497,
      "grad_norm": 30.66075380374518,
      "learning_rate": 4.938271604938271e-08,
      "loss": 2.9842,
      "step": 20
    },
    {
      "epoch": 0.36809815950920244,
      "grad_norm": 29.626899456192994,
      "learning_rate": 7.407407407407407e-08,
      "loss": 3.015,
      "step": 30
    },
    {
      "epoch": 0.49079754601226994,
      "grad_norm": 29.470505364249597,
      "learning_rate": 9.876543209876542e-08,
      "loss": 2.9547,
      "step": 40
    },
    {
      "epoch": 0.6134969325153374,
      "grad_norm": 27.407219480683995,
      "learning_rate": 1.2345679012345677e-07,
      "loss": 2.8784,
      "step": 50
    },
    {
      "epoch": 0.7361963190184049,
      "grad_norm": 20.68736366691469,
      "learning_rate": 1.4814814814814815e-07,
      "loss": 2.7798,
      "step": 60
    },
    {
      "epoch": 0.8588957055214724,
      "grad_norm": 17.11747188448127,
      "learning_rate": 1.728395061728395e-07,
      "loss": 2.5723,
      "step": 70
    },
    {
      "epoch": 0.9815950920245399,
      "grad_norm": 7.782003292603646,
      "learning_rate": 1.9753086419753084e-07,
      "loss": 2.4467,
      "step": 80
    },
    {
      "epoch": 1.1042944785276074,
      "grad_norm": 4.365497818859387,
      "learning_rate": 2.222222222222222e-07,
      "loss": 2.2687,
      "step": 90
    },
    {
      "epoch": 1.2269938650306749,
      "grad_norm": 3.3711421712732283,
      "learning_rate": 2.4691358024691354e-07,
      "loss": 2.1767,
      "step": 100
    },
    {
      "epoch": 1.3496932515337423,
      "grad_norm": 2.956591715084401,
      "learning_rate": 2.716049382716049e-07,
      "loss": 2.1332,
      "step": 110
    },
    {
      "epoch": 1.4723926380368098,
      "grad_norm": 2.635796811630019,
      "learning_rate": 2.962962962962963e-07,
      "loss": 2.1135,
      "step": 120
    },
    {
      "epoch": 1.5950920245398774,
      "grad_norm": 2.3273823610234876,
      "learning_rate": 3.209876543209876e-07,
      "loss": 2.1097,
      "step": 130
    },
    {
      "epoch": 1.7177914110429446,
      "grad_norm": 2.2617900128343833,
      "learning_rate": 3.45679012345679e-07,
      "loss": 2.1161,
      "step": 140
    },
    {
      "epoch": 1.8404907975460123,
      "grad_norm": 2.1638223937937076,
      "learning_rate": 3.703703703703703e-07,
      "loss": 2.0616,
      "step": 150
    },
    {
      "epoch": 1.9631901840490797,
      "grad_norm": 2.1437544811484854,
      "learning_rate": 3.950617283950617e-07,
      "loss": 2.0815,
      "step": 160
    },
    {
      "epoch": 2.085889570552147,
      "grad_norm": 2.1420606825819197,
      "learning_rate": 4.1975308641975306e-07,
      "loss": 2.063,
      "step": 170
    },
    {
      "epoch": 2.208588957055215,
      "grad_norm": 2.144143735036332,
      "learning_rate": 4.444444444444444e-07,
      "loss": 2.0513,
      "step": 180
    },
    {
      "epoch": 2.331288343558282,
      "grad_norm": 2.09031994519009,
      "learning_rate": 4.6913580246913576e-07,
      "loss": 2.0583,
      "step": 190
    },
    {
      "epoch": 2.4539877300613497,
      "grad_norm": 2.074953413736776,
      "learning_rate": 4.938271604938271e-07,
      "loss": 2.0446,
      "step": 200
    },
    {
      "epoch": 2.5766871165644174,
      "grad_norm": 2.0364457610758473,
      "learning_rate": 5.185185185185185e-07,
      "loss": 2.0336,
      "step": 210
    },
    {
      "epoch": 2.6993865030674846,
      "grad_norm": 2.09662270077011,
      "learning_rate": 5.432098765432098e-07,
      "loss": 2.03,
      "step": 220
    },
    {
      "epoch": 2.8220858895705523,
      "grad_norm": 2.0184952150772273,
      "learning_rate": 5.679012345679012e-07,
      "loss": 2.0197,
      "step": 230
    },
    {
      "epoch": 2.9447852760736195,
      "grad_norm": 2.2026915181725535,
      "learning_rate": 5.925925925925926e-07,
      "loss": 2.007,
      "step": 240
    },
    {
      "epoch": 3.067484662576687,
      "grad_norm": 1.9452709028668997,
      "learning_rate": 6.172839506172839e-07,
      "loss": 2.0014,
      "step": 250
    },
    {
      "epoch": 3.190184049079755,
      "grad_norm": 1.933109075200724,
      "learning_rate": 6.419753086419752e-07,
      "loss": 1.977,
      "step": 260
    },
    {
      "epoch": 3.312883435582822,
      "grad_norm": 1.9739789693701668,
      "learning_rate": 6.666666666666666e-07,
      "loss": 1.9816,
      "step": 270
    },
    {
      "epoch": 3.4355828220858897,
      "grad_norm": 2.0255729355536625,
      "learning_rate": 6.91358024691358e-07,
      "loss": 2.0138,
      "step": 280
    },
    {
      "epoch": 3.558282208588957,
      "grad_norm": 2.109260451722745,
      "learning_rate": 7.160493827160494e-07,
      "loss": 1.9852,
      "step": 290
    },
    {
      "epoch": 3.6809815950920246,
      "grad_norm": 2.02230955779347,
      "learning_rate": 7.407407407407406e-07,
      "loss": 1.989,
      "step": 300
    },
    {
      "epoch": 3.8036809815950923,
      "grad_norm": 2.114117608740877,
      "learning_rate": 7.65432098765432e-07,
      "loss": 1.9656,
      "step": 310
    },
    {
      "epoch": 3.9263803680981595,
      "grad_norm": 1.9608507920028475,
      "learning_rate": 7.901234567901234e-07,
      "loss": 1.9819,
      "step": 320
    },
    {
      "epoch": 4.049079754601227,
      "grad_norm": 1.992359665195648,
      "learning_rate": 8.148148148148147e-07,
      "loss": 1.9517,
      "step": 330
    },
    {
      "epoch": 4.171779141104294,
      "grad_norm": 2.0184004724781524,
      "learning_rate": 8.395061728395061e-07,
      "loss": 1.8995,
      "step": 340
    },
    {
      "epoch": 4.294478527607362,
      "grad_norm": 1.9438721415658846,
      "learning_rate": 8.641975308641974e-07,
      "loss": 1.9493,
      "step": 350
    },
    {
      "epoch": 4.41717791411043,
      "grad_norm": 2.037329633517169,
      "learning_rate": 8.888888888888888e-07,
      "loss": 1.9592,
      "step": 360
    },
    {
      "epoch": 4.539877300613497,
      "grad_norm": 2.045589243795842,
      "learning_rate": 9.135802469135801e-07,
      "loss": 1.9269,
      "step": 370
    },
    {
      "epoch": 4.662576687116564,
      "grad_norm": 2.0376567384336814,
      "learning_rate": 9.382716049382715e-07,
      "loss": 1.9091,
      "step": 380
    },
    {
      "epoch": 4.785276073619632,
      "grad_norm": 2.151886931601921,
      "learning_rate": 9.629629629629628e-07,
      "loss": 1.9338,
      "step": 390
    },
    {
      "epoch": 4.9079754601226995,
      "grad_norm": 2.1016930186631955,
      "learning_rate": 9.876543209876542e-07,
      "loss": 1.9361,
      "step": 400
    },
    {
      "epoch": 5.030674846625767,
      "grad_norm": 1.8425197677510754,
      "learning_rate": 9.999953571567085e-07,
      "loss": 1.8946,
      "step": 410
    },
    {
      "epoch": 5.153374233128835,
      "grad_norm": 2.1108335201344826,
      "learning_rate": 9.999582149277185e-07,
      "loss": 1.8636,
      "step": 420
    },
    {
      "epoch": 5.276073619631902,
      "grad_norm": 1.965491417959117,
      "learning_rate": 9.99883933228855e-07,
      "loss": 1.8863,
      "step": 430
    },
    {
      "epoch": 5.398773006134969,
      "grad_norm": 2.0748380468153442,
      "learning_rate": 9.997725175781443e-07,
      "loss": 1.8708,
      "step": 440
    },
    {
      "epoch": 5.521472392638037,
      "grad_norm": 2.0061874641389994,
      "learning_rate": 9.99623976252115e-07,
      "loss": 1.8629,
      "step": 450
    },
    {
      "epoch": 5.644171779141105,
      "grad_norm": 1.9227878063794117,
      "learning_rate": 9.994383202851812e-07,
      "loss": 1.8574,
      "step": 460
    },
    {
      "epoch": 5.766871165644172,
      "grad_norm": 2.022882706029264,
      "learning_rate": 9.992155634688238e-07,
      "loss": 1.8489,
      "step": 470
    },
    {
      "epoch": 5.889570552147239,
      "grad_norm": 1.9151165000656245,
      "learning_rate": 9.98955722350566e-07,
      "loss": 1.8702,
      "step": 480
    },
    {
      "epoch": 6.012269938650307,
      "grad_norm": 1.8932452511409288,
      "learning_rate": 9.986588162327434e-07,
      "loss": 1.8569,
      "step": 490
    },
    {
      "epoch": 6.134969325153374,
      "grad_norm": 2.0203097592372035,
      "learning_rate": 9.983248671710714e-07,
      "loss": 1.7873,
      "step": 500
    },
    {
      "epoch": 6.257668711656442,
      "grad_norm": 1.9081758147724377,
      "learning_rate": 9.979538999730047e-07,
      "loss": 1.779,
      "step": 510
    },
    {
      "epoch": 6.38036809815951,
      "grad_norm": 2.072946207781536,
      "learning_rate": 9.975459421958967e-07,
      "loss": 1.8006,
      "step": 520
    },
    {
      "epoch": 6.5030674846625764,
      "grad_norm": 2.0035469674735893,
      "learning_rate": 9.971010241449513e-07,
      "loss": 1.8018,
      "step": 530
    },
    {
      "epoch": 6.625766871165644,
      "grad_norm": 2.0835946373439143,
      "learning_rate": 9.966191788709714e-07,
      "loss": 1.8,
      "step": 540
    },
    {
      "epoch": 6.748466257668712,
      "grad_norm": 2.051477600119681,
      "learning_rate": 9.961004421679046e-07,
      "loss": 1.7869,
      "step": 550
    },
    {
      "epoch": 6.871165644171779,
      "grad_norm": 1.9111605839024262,
      "learning_rate": 9.955448525701835e-07,
      "loss": 1.7929,
      "step": 560
    },
    {
      "epoch": 6.993865030674847,
      "grad_norm": 2.036265101667604,
      "learning_rate": 9.949524513498636e-07,
      "loss": 1.8098,
      "step": 570
    },
    {
      "epoch": 7.116564417177914,
      "grad_norm": 1.9440590850722006,
      "learning_rate": 9.943232825135566e-07,
      "loss": 1.7158,
      "step": 580
    },
    {
      "epoch": 7.2392638036809815,
      "grad_norm": 1.9745053511466075,
      "learning_rate": 9.93657392799163e-07,
      "loss": 1.7269,
      "step": 590
    },
    {
      "epoch": 7.361963190184049,
      "grad_norm": 2.0172262033751522,
      "learning_rate": 9.92954831672398e-07,
      "loss": 1.7173,
      "step": 600
    },
    {
      "epoch": 7.484662576687117,
      "grad_norm": 2.017376130474714,
      "learning_rate": 9.922156513231197e-07,
      "loss": 1.723,
      "step": 610
    },
    {
      "epoch": 7.6073619631901845,
      "grad_norm": 2.129118497262339,
      "learning_rate": 9.914399066614487e-07,
      "loss": 1.7109,
      "step": 620
    },
    {
      "epoch": 7.730061349693251,
      "grad_norm": 2.0003358633590986,
      "learning_rate": 9.906276553136922e-07,
      "loss": 1.7287,
      "step": 630
    },
    {
      "epoch": 7.852760736196319,
      "grad_norm": 2.0380074509433554,
      "learning_rate": 9.897789576180616e-07,
      "loss": 1.7128,
      "step": 640
    },
    {
      "epoch": 7.975460122699387,
      "grad_norm": 1.972210672677589,
      "learning_rate": 9.888938766201907e-07,
      "loss": 1.7209,
      "step": 650
    },
    {
      "epoch": 8.098159509202453,
      "grad_norm": 2.0353896296888263,
      "learning_rate": 9.879724780684517e-07,
      "loss": 1.6636,
      "step": 660
    },
    {
      "epoch": 8.220858895705522,
      "grad_norm": 2.1575258154001067,
      "learning_rate": 9.87014830409073e-07,
      "loss": 1.6493,
      "step": 670
    },
    {
      "epoch": 8.343558282208589,
      "grad_norm": 2.0731699660667156,
      "learning_rate": 9.860210047810515e-07,
      "loss": 1.627,
      "step": 680
    },
    {
      "epoch": 8.466257668711656,
      "grad_norm": 2.1169883616519476,
      "learning_rate": 9.849910750108717e-07,
      "loss": 1.6516,
      "step": 690
    },
    {
      "epoch": 8.588957055214724,
      "grad_norm": 2.0906272175984917,
      "learning_rate": 9.839251176070183e-07,
      "loss": 1.6619,
      "step": 700
    },
    {
      "epoch": 8.71165644171779,
      "grad_norm": 2.059390905577669,
      "learning_rate": 9.828232117542947e-07,
      "loss": 1.6457,
      "step": 710
    },
    {
      "epoch": 8.83435582822086,
      "grad_norm": 2.111680195842936,
      "learning_rate": 9.816854393079402e-07,
      "loss": 1.6178,
      "step": 720
    },
    {
      "epoch": 8.957055214723926,
      "grad_norm": 2.144260583400968,
      "learning_rate": 9.805118847875487e-07,
      "loss": 1.6382,
      "step": 730
    },
    {
      "epoch": 9.079754601226995,
      "grad_norm": 2.285829860466323,
      "learning_rate": 9.793026353707914e-07,
      "loss": 1.5706,
      "step": 740
    },
    {
      "epoch": 9.202453987730062,
      "grad_norm": 2.153794951665122,
      "learning_rate": 9.780577808869398e-07,
      "loss": 1.5595,
      "step": 750
    },
    {
      "epoch": 9.325153374233128,
      "grad_norm": 2.138270605360391,
      "learning_rate": 9.767774138101934e-07,
      "loss": 1.5649,
      "step": 760
    },
    {
      "epoch": 9.447852760736197,
      "grad_norm": 2.140711587703231,
      "learning_rate": 9.754616292528093e-07,
      "loss": 1.5466,
      "step": 770
    },
    {
      "epoch": 9.570552147239264,
      "grad_norm": 2.221273893753507,
      "learning_rate": 9.74110524958038e-07,
      "loss": 1.5284,
      "step": 780
    },
    {
      "epoch": 9.69325153374233,
      "grad_norm": 2.3103597560152545,
      "learning_rate": 9.72724201292862e-07,
      "loss": 1.5592,
      "step": 790
    },
    {
      "epoch": 9.815950920245399,
      "grad_norm": 2.2066393219378373,
      "learning_rate": 9.713027612405394e-07,
      "loss": 1.546,
      "step": 800
    },
    {
      "epoch": 9.938650306748466,
      "grad_norm": 2.274268167957973,
      "learning_rate": 9.698463103929541e-07,
      "loss": 1.5556,
      "step": 810
    },
    {
      "epoch": 10.061349693251534,
      "grad_norm": 2.4288759947142666,
      "learning_rate": 9.68354956942773e-07,
      "loss": 1.5192,
      "step": 820
    },
    {
      "epoch": 10.184049079754601,
      "grad_norm": 2.56196295256849,
      "learning_rate": 9.668288116754076e-07,
      "loss": 1.4731,
      "step": 830
    },
    {
      "epoch": 10.30674846625767,
      "grad_norm": 2.351647479927511,
      "learning_rate": 9.652679879607843e-07,
      "loss": 1.4728,
      "step": 840
    },
    {
      "epoch": 10.429447852760736,
      "grad_norm": 2.3330020594718897,
      "learning_rate": 9.636726017449236e-07,
      "loss": 1.4558,
      "step": 850
    },
    {
      "epoch": 10.552147239263803,
      "grad_norm": 2.420606264665063,
      "learning_rate": 9.62042771541326e-07,
      "loss": 1.4287,
      "step": 860
    },
    {
      "epoch": 10.674846625766872,
      "grad_norm": 2.3947612477487867,
      "learning_rate": 9.603786184221692e-07,
      "loss": 1.4469,
      "step": 870
    },
    {
      "epoch": 10.797546012269938,
      "grad_norm": 2.3399004198935303,
      "learning_rate": 9.586802660093136e-07,
      "loss": 1.4396,
      "step": 880
    },
    {
      "epoch": 10.920245398773005,
      "grad_norm": 2.387016507283165,
      "learning_rate": 9.56947840465119e-07,
      "loss": 1.4595,
      "step": 890
    },
    {
      "epoch": 11.042944785276074,
      "grad_norm": 2.533848992609839,
      "learning_rate": 9.551814704830734e-07,
      "loss": 1.417,
      "step": 900
    },
    {
      "epoch": 11.16564417177914,
      "grad_norm": 2.5734232080415254,
      "learning_rate": 9.533812872782313e-07,
      "loss": 1.348,
      "step": 910
    },
    {
      "epoch": 11.28834355828221,
      "grad_norm": 2.6920925651578087,
      "learning_rate": 9.515474245774684e-07,
      "loss": 1.3556,
      "step": 920
    },
    {
      "epoch": 11.411042944785276,
      "grad_norm": 2.7013002533947574,
      "learning_rate": 9.496800186095465e-07,
      "loss": 1.3539,
      "step": 930
    },
    {
      "epoch": 11.533742331288344,
      "grad_norm": 2.731007254322984,
      "learning_rate": 9.477792080949938e-07,
      "loss": 1.3436,
      "step": 940
    },
    {
      "epoch": 11.656441717791411,
      "grad_norm": 2.8652743906067375,
      "learning_rate": 9.458451342358e-07,
      "loss": 1.3521,
      "step": 950
    },
    {
      "epoch": 11.779141104294478,
      "grad_norm": 2.5726368131527075,
      "learning_rate": 9.43877940704928e-07,
      "loss": 1.3426,
      "step": 960
    },
    {
      "epoch": 11.901840490797547,
      "grad_norm": 2.6814604487668463,
      "learning_rate": 9.418777736356393e-07,
      "loss": 1.3419,
      "step": 970
    },
    {
      "epoch": 12.024539877300613,
      "grad_norm": 3.1868357701860774,
      "learning_rate": 9.39844781610641e-07,
      "loss": 1.3155,
      "step": 980
    },
    {
      "epoch": 12.14723926380368,
      "grad_norm": 3.082069159033982,
      "learning_rate": 9.377791156510454e-07,
      "loss": 1.2428,
      "step": 990
    },
    {
      "epoch": 12.269938650306749,
      "grad_norm": 3.075730479893534,
      "learning_rate": 9.356809292051539e-07,
      "loss": 1.224,
      "step": 1000
    }
  ],
  "logging_steps": 10,
  "max_steps": 4050,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 50,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 92928324206592.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}