| { | |
| "best_global_step": 22000, | |
| "best_metric": 0.40355798602104187, | |
| "best_model_checkpoint": "Qwen-3-0.6B-it-Medical-LoRA/checkpoint-22000", | |
| "epoch": 49.998867497168746, | |
| "eval_steps": 100, | |
| "global_step": 22050, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.11325028312570781, | |
| "grad_norm": 0.3080866038799286, | |
| "learning_rate": 0.00019075425790754258, | |
| "loss": 1.8711, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.22650056625141562, | |
| "grad_norm": 0.28974413871765137, | |
| "learning_rate": 0.0001664233576642336, | |
| "loss": 1.338, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.22650056625141562, | |
| "eval_loss": 1.3033037185668945, | |
| "eval_runtime": 217.1489, | |
| "eval_samples_per_second": 7.225, | |
| "eval_steps_per_second": 0.907, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.33975084937712347, | |
| "grad_norm": 0.3371483385562897, | |
| "learning_rate": 0.0001420924574209246, | |
| "loss": 1.2923, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.45300113250283125, | |
| "grad_norm": 0.35730767250061035, | |
| "learning_rate": 0.00011776155717761557, | |
| "loss": 1.2703, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.45300113250283125, | |
| "eval_loss": 1.2490053176879883, | |
| "eval_runtime": 217.4793, | |
| "eval_samples_per_second": 7.214, | |
| "eval_steps_per_second": 0.906, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.5662514156285391, | |
| "grad_norm": 0.35110780596733093, | |
| "learning_rate": 9.343065693430657e-05, | |
| "loss": 1.2397, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.6795016987542469, | |
| "grad_norm": 0.35077276825904846, | |
| "learning_rate": 6.909975669099758e-05, | |
| "loss": 1.232, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.6795016987542469, | |
| "eval_loss": 1.2211977243423462, | |
| "eval_runtime": 217.5364, | |
| "eval_samples_per_second": 7.213, | |
| "eval_steps_per_second": 0.906, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.7927519818799547, | |
| "grad_norm": 0.3939191699028015, | |
| "learning_rate": 4.476885644768857e-05, | |
| "loss": 1.2241, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.9060022650056625, | |
| "grad_norm": 0.366871178150177, | |
| "learning_rate": 2.0437956204379563e-05, | |
| "loss": 1.2078, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.9060022650056625, | |
| "eval_loss": 1.2062289714813232, | |
| "eval_runtime": 217.4838, | |
| "eval_samples_per_second": 7.214, | |
| "eval_steps_per_second": 0.906, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.0203850509626273, | |
| "grad_norm": 0.3808969259262085, | |
| "learning_rate": 0.00010164319248826291, | |
| "loss": 1.1819, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.1336353340883352, | |
| "grad_norm": 0.43216949701309204, | |
| "learning_rate": 8.990610328638498e-05, | |
| "loss": 1.19, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.1336353340883352, | |
| "eval_loss": 1.1994948387145996, | |
| "eval_runtime": 217.3626, | |
| "eval_samples_per_second": 7.218, | |
| "eval_steps_per_second": 0.906, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.246885617214043, | |
| "grad_norm": 0.4280295968055725, | |
| "learning_rate": 7.816901408450704e-05, | |
| "loss": 1.1971, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.3601359003397508, | |
| "grad_norm": 0.4056779146194458, | |
| "learning_rate": 6.643192488262912e-05, | |
| "loss": 1.1771, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.3601359003397508, | |
| "eval_loss": 1.1834282875061035, | |
| "eval_runtime": 217.4514, | |
| "eval_samples_per_second": 7.215, | |
| "eval_steps_per_second": 0.906, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.4733861834654587, | |
| "grad_norm": 0.4397243857383728, | |
| "learning_rate": 5.469483568075118e-05, | |
| "loss": 1.1544, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.5866364665911665, | |
| "grad_norm": 0.4214654862880707, | |
| "learning_rate": 4.295774647887324e-05, | |
| "loss": 1.1789, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.5866364665911665, | |
| "eval_loss": 1.1712530851364136, | |
| "eval_runtime": 217.6023, | |
| "eval_samples_per_second": 7.21, | |
| "eval_steps_per_second": 0.905, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.6998867497168741, | |
| "grad_norm": 0.43076109886169434, | |
| "learning_rate": 3.1220657276995305e-05, | |
| "loss": 1.1522, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.8131370328425822, | |
| "grad_norm": 0.4253358244895935, | |
| "learning_rate": 1.9483568075117372e-05, | |
| "loss": 1.1508, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.8131370328425822, | |
| "eval_loss": 1.1622345447540283, | |
| "eval_runtime": 217.3951, | |
| "eval_samples_per_second": 7.217, | |
| "eval_steps_per_second": 0.906, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.9263873159682898, | |
| "grad_norm": 0.4359077215194702, | |
| "learning_rate": 7.746478873239436e-06, | |
| "loss": 1.1422, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 2.0407701019252547, | |
| "grad_norm": 0.4511992633342743, | |
| "learning_rate": 0.00014314687602224403, | |
| "loss": 1.149, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 2.0407701019252547, | |
| "eval_loss": 1.167581558227539, | |
| "eval_runtime": 218.6888, | |
| "eval_samples_per_second": 7.175, | |
| "eval_steps_per_second": 0.901, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 2.1540203850509627, | |
| "grad_norm": 0.47519659996032715, | |
| "learning_rate": 0.00013987569512594048, | |
| "loss": 1.1498, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 2.2672706681766703, | |
| "grad_norm": 0.4559363126754761, | |
| "learning_rate": 0.00013660451422963692, | |
| "loss": 1.1469, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 2.2672706681766703, | |
| "eval_loss": 1.1536333560943604, | |
| "eval_runtime": 218.921, | |
| "eval_samples_per_second": 7.167, | |
| "eval_steps_per_second": 0.9, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 2.3805209513023784, | |
| "grad_norm": 0.49805569648742676, | |
| "learning_rate": 0.00013333333333333334, | |
| "loss": 1.123, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 2.493771234428086, | |
| "grad_norm": 0.4767671823501587, | |
| "learning_rate": 0.00013006215243702978, | |
| "loss": 1.1119, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 2.493771234428086, | |
| "eval_loss": 1.1321617364883423, | |
| "eval_runtime": 218.8468, | |
| "eval_samples_per_second": 7.169, | |
| "eval_steps_per_second": 0.9, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 2.607021517553794, | |
| "grad_norm": 0.4313490688800812, | |
| "learning_rate": 0.0001267909715407262, | |
| "loss": 1.1096, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 2.7202718006795017, | |
| "grad_norm": 0.46401792764663696, | |
| "learning_rate": 0.00012351979064442265, | |
| "loss": 1.0929, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 2.7202718006795017, | |
| "eval_loss": 1.1138092279434204, | |
| "eval_runtime": 219.2403, | |
| "eval_samples_per_second": 7.157, | |
| "eval_steps_per_second": 0.899, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 2.8335220838052093, | |
| "grad_norm": 0.46533071994781494, | |
| "learning_rate": 0.00012024860974811907, | |
| "loss": 1.0894, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 2.9467723669309174, | |
| "grad_norm": 0.42782357335090637, | |
| "learning_rate": 0.00011697742885181551, | |
| "loss": 1.072, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 2.9467723669309174, | |
| "eval_loss": 1.0986168384552002, | |
| "eval_runtime": 219.0662, | |
| "eval_samples_per_second": 7.162, | |
| "eval_steps_per_second": 0.899, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 3.061155152887882, | |
| "grad_norm": 0.46029889583587646, | |
| "learning_rate": 0.00011370624795551194, | |
| "loss": 1.0936, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 3.17440543601359, | |
| "grad_norm": 0.5067735314369202, | |
| "learning_rate": 0.00011043506705920839, | |
| "loss": 1.0304, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 3.17440543601359, | |
| "eval_loss": 1.0839864015579224, | |
| "eval_runtime": 219.2396, | |
| "eval_samples_per_second": 7.157, | |
| "eval_steps_per_second": 0.899, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 3.287655719139298, | |
| "grad_norm": 0.46760454773902893, | |
| "learning_rate": 0.0001071638861629048, | |
| "loss": 1.0361, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 3.4009060022650055, | |
| "grad_norm": 0.5199077129364014, | |
| "learning_rate": 0.00010389270526660124, | |
| "loss": 1.0304, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 3.4009060022650055, | |
| "eval_loss": 1.070657730102539, | |
| "eval_runtime": 219.1222, | |
| "eval_samples_per_second": 7.16, | |
| "eval_steps_per_second": 0.899, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 3.5141562853907136, | |
| "grad_norm": 0.496124267578125, | |
| "learning_rate": 0.00010062152437029768, | |
| "loss": 1.0202, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 3.627406568516421, | |
| "grad_norm": 0.5154497623443604, | |
| "learning_rate": 9.735034347399413e-05, | |
| "loss": 1.007, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 3.627406568516421, | |
| "eval_loss": 1.0554137229919434, | |
| "eval_runtime": 219.2648, | |
| "eval_samples_per_second": 7.156, | |
| "eval_steps_per_second": 0.898, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 3.7406568516421292, | |
| "grad_norm": 0.4881006181240082, | |
| "learning_rate": 9.407916257769055e-05, | |
| "loss": 0.9934, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 3.853907134767837, | |
| "grad_norm": 0.5507743954658508, | |
| "learning_rate": 9.080798168138699e-05, | |
| "loss": 0.9894, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 3.853907134767837, | |
| "eval_loss": 1.0418345928192139, | |
| "eval_runtime": 219.154, | |
| "eval_samples_per_second": 7.159, | |
| "eval_steps_per_second": 0.899, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 3.967157417893545, | |
| "grad_norm": 0.5333808064460754, | |
| "learning_rate": 8.753680078508342e-05, | |
| "loss": 0.9795, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 4.081540203850509, | |
| "grad_norm": 0.551164448261261, | |
| "learning_rate": 8.426561988877985e-05, | |
| "loss": 0.974, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 4.081540203850509, | |
| "eval_loss": 1.0327671766281128, | |
| "eval_runtime": 219.1392, | |
| "eval_samples_per_second": 7.16, | |
| "eval_steps_per_second": 0.899, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 4.194790486976218, | |
| "grad_norm": 0.5678717494010925, | |
| "learning_rate": 8.099443899247629e-05, | |
| "loss": 0.9204, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 4.308040770101925, | |
| "grad_norm": 0.5472707152366638, | |
| "learning_rate": 7.772325809617273e-05, | |
| "loss": 0.9341, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 4.308040770101925, | |
| "eval_loss": 1.0188047885894775, | |
| "eval_runtime": 219.1338, | |
| "eval_samples_per_second": 7.16, | |
| "eval_steps_per_second": 0.899, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 4.421291053227633, | |
| "grad_norm": 0.5799363255500793, | |
| "learning_rate": 7.445207719986915e-05, | |
| "loss": 0.9313, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 4.534541336353341, | |
| "grad_norm": 0.6046631336212158, | |
| "learning_rate": 7.11808963035656e-05, | |
| "loss": 0.9325, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 4.534541336353341, | |
| "eval_loss": 1.004631519317627, | |
| "eval_runtime": 219.1114, | |
| "eval_samples_per_second": 7.161, | |
| "eval_steps_per_second": 0.899, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 4.647791619479049, | |
| "grad_norm": 0.5897740721702576, | |
| "learning_rate": 6.790971540726203e-05, | |
| "loss": 0.9213, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 4.761041902604757, | |
| "grad_norm": 0.583991289138794, | |
| "learning_rate": 6.463853451095846e-05, | |
| "loss": 0.9039, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 4.761041902604757, | |
| "eval_loss": 0.9938989877700806, | |
| "eval_runtime": 219.0293, | |
| "eval_samples_per_second": 7.163, | |
| "eval_steps_per_second": 0.899, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 4.874292185730464, | |
| "grad_norm": 0.6264305710792542, | |
| "learning_rate": 6.13673536146549e-05, | |
| "loss": 0.9028, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 4.987542468856172, | |
| "grad_norm": 0.6474761962890625, | |
| "learning_rate": 5.809617271835133e-05, | |
| "loss": 0.9053, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 4.987542468856172, | |
| "eval_loss": 0.9845430254936218, | |
| "eval_runtime": 219.2502, | |
| "eval_samples_per_second": 7.156, | |
| "eval_steps_per_second": 0.899, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 5.101925254813137, | |
| "grad_norm": 0.6595875024795532, | |
| "learning_rate": 5.4824991822047765e-05, | |
| "loss": 0.882, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 5.215175537938845, | |
| "grad_norm": 0.6405232548713684, | |
| "learning_rate": 5.15538109257442e-05, | |
| "loss": 0.8471, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 5.215175537938845, | |
| "eval_loss": 0.9782047867774963, | |
| "eval_runtime": 219.0751, | |
| "eval_samples_per_second": 7.162, | |
| "eval_steps_per_second": 0.899, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 5.3284258210645525, | |
| "grad_norm": 0.6547350287437439, | |
| "learning_rate": 4.828263002944063e-05, | |
| "loss": 0.8602, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 5.44167610419026, | |
| "grad_norm": 0.7046269178390503, | |
| "learning_rate": 4.501144913313706e-05, | |
| "loss": 0.8404, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 5.44167610419026, | |
| "eval_loss": 0.9688066244125366, | |
| "eval_runtime": 219.0622, | |
| "eval_samples_per_second": 7.162, | |
| "eval_steps_per_second": 0.899, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 5.554926387315969, | |
| "grad_norm": 0.6331756114959717, | |
| "learning_rate": 4.17402682368335e-05, | |
| "loss": 0.8286, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 5.668176670441676, | |
| "grad_norm": 0.7212900519371033, | |
| "learning_rate": 3.846908734052994e-05, | |
| "loss": 0.8382, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 5.668176670441676, | |
| "eval_loss": 0.9589976668357849, | |
| "eval_runtime": 219.1257, | |
| "eval_samples_per_second": 7.16, | |
| "eval_steps_per_second": 0.899, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 5.781426953567384, | |
| "grad_norm": 0.6771254539489746, | |
| "learning_rate": 3.519790644422637e-05, | |
| "loss": 0.8359, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 5.8946772366930915, | |
| "grad_norm": 0.7171376943588257, | |
| "learning_rate": 3.19267255479228e-05, | |
| "loss": 0.832, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 5.8946772366930915, | |
| "eval_loss": 0.9501948952674866, | |
| "eval_runtime": 219.0768, | |
| "eval_samples_per_second": 7.162, | |
| "eval_steps_per_second": 0.899, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 6.009060022650057, | |
| "grad_norm": 0.6734739542007446, | |
| "learning_rate": 2.865554465161924e-05, | |
| "loss": 0.8437, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 6.122310305775764, | |
| "grad_norm": 0.697407603263855, | |
| "learning_rate": 2.538436375531567e-05, | |
| "loss": 0.7937, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 6.122310305775764, | |
| "eval_loss": 0.9480313658714294, | |
| "eval_runtime": 218.949, | |
| "eval_samples_per_second": 7.166, | |
| "eval_steps_per_second": 0.9, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 6.235560588901472, | |
| "grad_norm": 0.7092292904853821, | |
| "learning_rate": 2.2113182859012105e-05, | |
| "loss": 0.7804, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 6.34881087202718, | |
| "grad_norm": 0.7284964919090271, | |
| "learning_rate": 1.884200196270854e-05, | |
| "loss": 0.7861, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 6.34881087202718, | |
| "eval_loss": 0.942541241645813, | |
| "eval_runtime": 219.1707, | |
| "eval_samples_per_second": 7.159, | |
| "eval_steps_per_second": 0.899, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 6.462061155152888, | |
| "grad_norm": 0.7725135087966919, | |
| "learning_rate": 1.557082106640497e-05, | |
| "loss": 0.776, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 6.575311438278596, | |
| "grad_norm": 0.7266800403594971, | |
| "learning_rate": 1.2299640170101408e-05, | |
| "loss": 0.7812, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 6.575311438278596, | |
| "eval_loss": 0.939509928226471, | |
| "eval_runtime": 219.1206, | |
| "eval_samples_per_second": 7.16, | |
| "eval_steps_per_second": 0.899, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 6.688561721404303, | |
| "grad_norm": 0.7308298349380493, | |
| "learning_rate": 9.028459273797842e-06, | |
| "loss": 0.7827, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 6.801812004530011, | |
| "grad_norm": 0.7362912893295288, | |
| "learning_rate": 5.757278377494276e-06, | |
| "loss": 0.7917, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 6.801812004530011, | |
| "eval_loss": 0.9356247782707214, | |
| "eval_runtime": 219.052, | |
| "eval_samples_per_second": 7.163, | |
| "eval_steps_per_second": 0.899, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 6.9150622876557195, | |
| "grad_norm": 0.7543765902519226, | |
| "learning_rate": 2.4860974811907098e-06, | |
| "loss": 0.7738, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 7.029445073612684, | |
| "grad_norm": 0.7134389877319336, | |
| "learning_rate": 5.986301369863014e-05, | |
| "loss": 0.7481, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 7.029445073612684, | |
| "eval_loss": 0.9381225109100342, | |
| "eval_runtime": 218.2269, | |
| "eval_samples_per_second": 7.19, | |
| "eval_steps_per_second": 0.903, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 7.1426953567383915, | |
| "grad_norm": 0.8131405711174011, | |
| "learning_rate": 5.757990867579909e-05, | |
| "loss": 0.7725, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 7.2559456398641, | |
| "grad_norm": 0.8759368062019348, | |
| "learning_rate": 5.529680365296805e-05, | |
| "loss": 0.7752, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 7.2559456398641, | |
| "eval_loss": 0.9386877417564392, | |
| "eval_runtime": 218.4203, | |
| "eval_samples_per_second": 7.183, | |
| "eval_steps_per_second": 0.902, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 7.369195922989808, | |
| "grad_norm": 0.8374108076095581, | |
| "learning_rate": 5.3013698630136986e-05, | |
| "loss": 0.7765, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 7.482446206115515, | |
| "grad_norm": 0.8505973815917969, | |
| "learning_rate": 5.0730593607305946e-05, | |
| "loss": 0.7791, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 7.482446206115515, | |
| "eval_loss": 0.9266760349273682, | |
| "eval_runtime": 218.4773, | |
| "eval_samples_per_second": 7.182, | |
| "eval_steps_per_second": 0.902, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 7.595696489241223, | |
| "grad_norm": 0.8420349359512329, | |
| "learning_rate": 4.8447488584474886e-05, | |
| "loss": 0.7721, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 7.7089467723669305, | |
| "grad_norm": 0.892084002494812, | |
| "learning_rate": 4.616438356164384e-05, | |
| "loss": 0.7626, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 7.7089467723669305, | |
| "eval_loss": 0.9153051376342773, | |
| "eval_runtime": 218.391, | |
| "eval_samples_per_second": 7.184, | |
| "eval_steps_per_second": 0.902, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 7.822197055492639, | |
| "grad_norm": 1.0072320699691772, | |
| "learning_rate": 4.3881278538812785e-05, | |
| "loss": 0.7578, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 7.935447338618347, | |
| "grad_norm": 0.841740608215332, | |
| "learning_rate": 4.159817351598174e-05, | |
| "loss": 0.755, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 7.935447338618347, | |
| "eval_loss": 0.9030627012252808, | |
| "eval_runtime": 218.5627, | |
| "eval_samples_per_second": 7.179, | |
| "eval_steps_per_second": 0.901, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 8.049830124575312, | |
| "grad_norm": 0.9417058825492859, | |
| "learning_rate": 3.9315068493150684e-05, | |
| "loss": 0.7419, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 8.163080407701019, | |
| "grad_norm": 0.8208181858062744, | |
| "learning_rate": 3.703196347031964e-05, | |
| "loss": 0.7079, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 8.163080407701019, | |
| "eval_loss": 0.9004995226860046, | |
| "eval_runtime": 218.599, | |
| "eval_samples_per_second": 7.178, | |
| "eval_steps_per_second": 0.901, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 8.276330690826727, | |
| "grad_norm": 0.8969956040382385, | |
| "learning_rate": 3.4748858447488584e-05, | |
| "loss": 0.7184, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 8.389580973952436, | |
| "grad_norm": 0.9903959631919861, | |
| "learning_rate": 3.246575342465754e-05, | |
| "loss": 0.6977, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 8.389580973952436, | |
| "eval_loss": 0.895404577255249, | |
| "eval_runtime": 218.551, | |
| "eval_samples_per_second": 7.179, | |
| "eval_steps_per_second": 0.901, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 8.502831257078142, | |
| "grad_norm": 0.8987964391708374, | |
| "learning_rate": 3.0182648401826487e-05, | |
| "loss": 0.6981, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 8.61608154020385, | |
| "grad_norm": 0.9351384043693542, | |
| "learning_rate": 2.7899543378995436e-05, | |
| "loss": 0.6985, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 8.61608154020385, | |
| "eval_loss": 0.8867019414901733, | |
| "eval_runtime": 218.5395, | |
| "eval_samples_per_second": 7.179, | |
| "eval_steps_per_second": 0.901, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 8.729331823329558, | |
| "grad_norm": 0.9520925283432007, | |
| "learning_rate": 2.5616438356164386e-05, | |
| "loss": 0.7041, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 8.842582106455266, | |
| "grad_norm": 0.9150193333625793, | |
| "learning_rate": 2.3333333333333336e-05, | |
| "loss": 0.6946, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 8.842582106455266, | |
| "eval_loss": 0.8767301440238953, | |
| "eval_runtime": 218.4183, | |
| "eval_samples_per_second": 7.183, | |
| "eval_steps_per_second": 0.902, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 8.955832389580975, | |
| "grad_norm": 0.9718352556228638, | |
| "learning_rate": 2.1050228310502286e-05, | |
| "loss": 0.6837, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 9.070215175537939, | |
| "grad_norm": 0.9025393724441528, | |
| "learning_rate": 1.8767123287671235e-05, | |
| "loss": 0.6821, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 9.070215175537939, | |
| "eval_loss": 0.8735217452049255, | |
| "eval_runtime": 218.5455, | |
| "eval_samples_per_second": 7.179, | |
| "eval_steps_per_second": 0.901, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 9.183465458663647, | |
| "grad_norm": 0.9804911017417908, | |
| "learning_rate": 1.6484018264840185e-05, | |
| "loss": 0.6533, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 9.296715741789354, | |
| "grad_norm": 0.8889093399047852, | |
| "learning_rate": 1.4200913242009135e-05, | |
| "loss": 0.6549, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 9.296715741789354, | |
| "eval_loss": 0.8693042993545532, | |
| "eval_runtime": 218.4928, | |
| "eval_samples_per_second": 7.181, | |
| "eval_steps_per_second": 0.902, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 9.409966024915063, | |
| "grad_norm": 0.9306142926216125, | |
| "learning_rate": 1.1917808219178083e-05, | |
| "loss": 0.643, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 9.52321630804077, | |
| "grad_norm": 1.0180792808532715, | |
| "learning_rate": 9.634703196347032e-06, | |
| "loss": 0.6498, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 9.52321630804077, | |
| "eval_loss": 0.8649076223373413, | |
| "eval_runtime": 218.6148, | |
| "eval_samples_per_second": 7.177, | |
| "eval_steps_per_second": 0.901, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 9.636466591166478, | |
| "grad_norm": 1.038870930671692, | |
| "learning_rate": 7.351598173515982e-06, | |
| "loss": 0.6633, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 9.749716874292186, | |
| "grad_norm": 0.9064520001411438, | |
| "learning_rate": 5.068493150684932e-06, | |
| "loss": 0.6503, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 9.749716874292186, | |
| "eval_loss": 0.8624854683876038, | |
| "eval_runtime": 218.607, | |
| "eval_samples_per_second": 7.177, | |
| "eval_steps_per_second": 0.901, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 9.862967157417893, | |
| "grad_norm": 0.9673233032226562, | |
| "learning_rate": 2.7853881278538815e-06, | |
| "loss": 0.6439, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 9.976217440543602, | |
| "grad_norm": 0.9512138366699219, | |
| "learning_rate": 5.022831050228311e-07, | |
| "loss": 0.6427, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 9.976217440543602, | |
| "eval_loss": 0.8610817790031433, | |
| "eval_runtime": 218.4692, | |
| "eval_samples_per_second": 7.182, | |
| "eval_steps_per_second": 0.902, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 10.090600226500566, | |
| "grad_norm": 0.9980069994926453, | |
| "learning_rate": 4.5028932140978435e-05, | |
| "loss": 0.6435, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 10.203850509626275, | |
| "grad_norm": 1.1042736768722534, | |
| "learning_rate": 4.327546905137647e-05, | |
| "loss": 0.6473, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 10.203850509626275, | |
| "eval_loss": 0.8664056658744812, | |
| "eval_runtime": 218.1832, | |
| "eval_samples_per_second": 7.191, | |
| "eval_steps_per_second": 0.903, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 10.317100792751981, | |
| "grad_norm": 1.093027114868164, | |
| "learning_rate": 4.1522005961774504e-05, | |
| "loss": 0.6428, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 10.43035107587769, | |
| "grad_norm": 1.1941519975662231, | |
| "learning_rate": 3.9768542872172545e-05, | |
| "loss": 0.6453, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 10.43035107587769, | |
| "eval_loss": 0.8576545715332031, | |
| "eval_runtime": 218.2133, | |
| "eval_samples_per_second": 7.19, | |
| "eval_steps_per_second": 0.903, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 10.543601359003397, | |
| "grad_norm": 1.1875131130218506, | |
| "learning_rate": 3.801507978257058e-05, | |
| "loss": 0.6444, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 10.656851642129105, | |
| "grad_norm": 1.057826042175293, | |
| "learning_rate": 3.6261616692968614e-05, | |
| "loss": 0.6374, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 10.656851642129105, | |
| "eval_loss": 0.8471001982688904, | |
| "eval_runtime": 218.248, | |
| "eval_samples_per_second": 7.189, | |
| "eval_steps_per_second": 0.903, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 10.770101925254814, | |
| "grad_norm": 1.1481099128723145, | |
| "learning_rate": 3.450815360336665e-05, | |
| "loss": 0.6367, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 10.88335220838052, | |
| "grad_norm": 1.043562412261963, | |
| "learning_rate": 3.275469051376468e-05, | |
| "loss": 0.6382, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 10.88335220838052, | |
| "eval_loss": 0.8414534330368042, | |
| "eval_runtime": 218.3266, | |
| "eval_samples_per_second": 7.186, | |
| "eval_steps_per_second": 0.902, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 10.996602491506229, | |
| "grad_norm": 1.1026701927185059, | |
| "learning_rate": 3.1001227424162724e-05, | |
| "loss": 0.6363, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 11.110985277463193, | |
| "grad_norm": 1.2548056840896606, | |
| "learning_rate": 2.9247764334560758e-05, | |
| "loss": 0.6197, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 11.110985277463193, | |
| "eval_loss": 0.8344885110855103, | |
| "eval_runtime": 218.346, | |
| "eval_samples_per_second": 7.186, | |
| "eval_steps_per_second": 0.902, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 11.224235560588902, | |
| "grad_norm": 1.2327723503112793, | |
| "learning_rate": 2.7494301244958792e-05, | |
| "loss": 0.5955, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 11.337485843714608, | |
| "grad_norm": 1.272136926651001, | |
| "learning_rate": 2.5740838155356834e-05, | |
| "loss": 0.5888, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 11.337485843714608, | |
| "eval_loss": 0.8296782374382019, | |
| "eval_runtime": 218.3829, | |
| "eval_samples_per_second": 7.185, | |
| "eval_steps_per_second": 0.902, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 11.450736126840317, | |
| "grad_norm": 1.3154362440109253, | |
| "learning_rate": 2.3987375065754868e-05, | |
| "loss": 0.5821, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 11.563986409966025, | |
| "grad_norm": 1.2641000747680664, | |
| "learning_rate": 2.2233911976152902e-05, | |
| "loss": 0.5786, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 11.563986409966025, | |
| "eval_loss": 0.8227117657661438, | |
| "eval_runtime": 218.3919, | |
| "eval_samples_per_second": 7.184, | |
| "eval_steps_per_second": 0.902, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 11.677236693091732, | |
| "grad_norm": 1.308750033378601, | |
| "learning_rate": 2.048044888655094e-05, | |
| "loss": 0.5876, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 11.79048697621744, | |
| "grad_norm": 1.2791666984558105, | |
| "learning_rate": 1.8726985796948974e-05, | |
| "loss": 0.5886, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 11.79048697621744, | |
| "eval_loss": 0.8168981075286865, | |
| "eval_runtime": 218.4082, | |
| "eval_samples_per_second": 7.184, | |
| "eval_steps_per_second": 0.902, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 11.90373725934315, | |
| "grad_norm": 1.1309980154037476, | |
| "learning_rate": 1.6973522707347012e-05, | |
| "loss": 0.5816, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 12.018120045300114, | |
| "grad_norm": 1.2232533693313599, | |
| "learning_rate": 1.5220059617745046e-05, | |
| "loss": 0.5993, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 12.018120045300114, | |
| "eval_loss": 0.8122690916061401, | |
| "eval_runtime": 218.3894, | |
| "eval_samples_per_second": 7.184, | |
| "eval_steps_per_second": 0.902, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 12.13137032842582, | |
| "grad_norm": 1.1197330951690674, | |
| "learning_rate": 1.3466596528143083e-05, | |
| "loss": 0.552, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 12.244620611551529, | |
| "grad_norm": 1.038383960723877, | |
| "learning_rate": 1.171313343854112e-05, | |
| "loss": 0.5461, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 12.244620611551529, | |
| "eval_loss": 0.810990571975708, | |
| "eval_runtime": 218.3359, | |
| "eval_samples_per_second": 7.186, | |
| "eval_steps_per_second": 0.902, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 12.357870894677237, | |
| "grad_norm": 1.2155468463897705, | |
| "learning_rate": 9.959670348939155e-06, | |
| "loss": 0.5487, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 12.471121177802944, | |
| "grad_norm": 1.0609550476074219, | |
| "learning_rate": 8.20620725933719e-06, | |
| "loss": 0.5524, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 12.471121177802944, | |
| "eval_loss": 0.8065800070762634, | |
| "eval_runtime": 218.3379, | |
| "eval_samples_per_second": 7.186, | |
| "eval_steps_per_second": 0.902, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 12.584371460928653, | |
| "grad_norm": 1.1328603029251099, | |
| "learning_rate": 6.452744169735227e-06, | |
| "loss": 0.5437, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 12.69762174405436, | |
| "grad_norm": 1.0544012784957886, | |
| "learning_rate": 4.699281080133264e-06, | |
| "loss": 0.5521, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 12.69762174405436, | |
| "eval_loss": 0.8028028607368469, | |
| "eval_runtime": 218.3014, | |
| "eval_samples_per_second": 7.187, | |
| "eval_steps_per_second": 0.902, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 12.810872027180068, | |
| "grad_norm": 1.1334656476974487, | |
| "learning_rate": 2.9458179905312994e-06, | |
| "loss": 0.5537, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 12.924122310305776, | |
| "grad_norm": 1.133388638496399, | |
| "learning_rate": 1.1923549009293354e-06, | |
| "loss": 0.5502, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 12.924122310305776, | |
| "eval_loss": 0.801500141620636, | |
| "eval_runtime": 218.1671, | |
| "eval_samples_per_second": 7.192, | |
| "eval_steps_per_second": 0.903, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 13.03850509626274, | |
| "grad_norm": 1.093996524810791, | |
| "learning_rate": 2.6302201974183753e-05, | |
| "loss": 0.5348, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 13.15175537938845, | |
| "grad_norm": 1.1750720739364624, | |
| "learning_rate": 2.478359908883827e-05, | |
| "loss": 0.5407, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 13.15175537938845, | |
| "eval_loss": 0.8037387728691101, | |
| "eval_runtime": 219.5954, | |
| "eval_samples_per_second": 7.145, | |
| "eval_steps_per_second": 0.897, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 13.265005662514156, | |
| "grad_norm": 1.3370305299758911, | |
| "learning_rate": 2.3264996203492785e-05, | |
| "loss": 0.5427, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 13.378255945639864, | |
| "grad_norm": 1.361132025718689, | |
| "learning_rate": 2.1746393318147306e-05, | |
| "loss": 0.5587, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 13.378255945639864, | |
| "eval_loss": 0.7990919351577759, | |
| "eval_runtime": 219.6972, | |
| "eval_samples_per_second": 7.142, | |
| "eval_steps_per_second": 0.897, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 13.491506228765571, | |
| "grad_norm": 1.2097536325454712, | |
| "learning_rate": 2.0227790432801824e-05, | |
| "loss": 0.5378, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 13.60475651189128, | |
| "grad_norm": 2.1065151691436768, | |
| "learning_rate": 1.8709187547456342e-05, | |
| "loss": 0.538, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 13.60475651189128, | |
| "eval_loss": 0.7955650091171265, | |
| "eval_runtime": 219.6995, | |
| "eval_samples_per_second": 7.142, | |
| "eval_steps_per_second": 0.897, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 13.718006795016988, | |
| "grad_norm": 1.3054521083831787, | |
| "learning_rate": 1.719058466211086e-05, | |
| "loss": 0.5319, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 13.831257078142695, | |
| "grad_norm": 1.2116392850875854, | |
| "learning_rate": 1.5671981776765377e-05, | |
| "loss": 0.5382, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 13.831257078142695, | |
| "eval_loss": 0.7880010008811951, | |
| "eval_runtime": 219.8172, | |
| "eval_samples_per_second": 7.138, | |
| "eval_steps_per_second": 0.896, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 13.944507361268403, | |
| "grad_norm": 1.437024474143982, | |
| "learning_rate": 1.4153378891419893e-05, | |
| "loss": 0.5361, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 14.058890147225368, | |
| "grad_norm": 1.1516680717468262, | |
| "learning_rate": 1.2665148063781321e-05, | |
| "loss": 0.5453, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 14.058890147225368, | |
| "eval_loss": 0.783509373664856, | |
| "eval_runtime": 219.7823, | |
| "eval_samples_per_second": 7.139, | |
| "eval_steps_per_second": 0.896, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 14.172140430351076, | |
| "grad_norm": 1.182915210723877, | |
| "learning_rate": 1.114654517843584e-05, | |
| "loss": 0.5085, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 14.285390713476783, | |
| "grad_norm": 1.225037693977356, | |
| "learning_rate": 9.627942293090357e-06, | |
| "loss": 0.5112, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 14.285390713476783, | |
| "eval_loss": 0.7822558283805847, | |
| "eval_runtime": 219.842, | |
| "eval_samples_per_second": 7.137, | |
| "eval_steps_per_second": 0.896, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 14.398640996602492, | |
| "grad_norm": 1.1970784664154053, | |
| "learning_rate": 8.109339407744875e-06, | |
| "loss": 0.5079, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 14.5118912797282, | |
| "grad_norm": 1.1259725093841553, | |
| "learning_rate": 6.590736522399393e-06, | |
| "loss": 0.5129, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 14.5118912797282, | |
| "eval_loss": 0.7796412110328674, | |
| "eval_runtime": 219.8188, | |
| "eval_samples_per_second": 7.138, | |
| "eval_steps_per_second": 0.896, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 14.625141562853907, | |
| "grad_norm": 1.236473798751831, | |
| "learning_rate": 5.072133637053911e-06, | |
| "loss": 0.5055, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 14.738391845979615, | |
| "grad_norm": 1.1682021617889404, | |
| "learning_rate": 3.553530751708428e-06, | |
| "loss": 0.5074, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 14.738391845979615, | |
| "eval_loss": 0.7759413719177246, | |
| "eval_runtime": 219.6677, | |
| "eval_samples_per_second": 7.143, | |
| "eval_steps_per_second": 0.897, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 14.851642129105322, | |
| "grad_norm": 1.190508484840393, | |
| "learning_rate": 2.0349278663629463e-06, | |
| "loss": 0.5103, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 14.96489241223103, | |
| "grad_norm": 1.18021559715271, | |
| "learning_rate": 5.163249810174639e-07, | |
| "loss": 0.5011, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 14.96489241223103, | |
| "eval_loss": 0.7753218412399292, | |
| "eval_runtime": 219.7467, | |
| "eval_samples_per_second": 7.14, | |
| "eval_steps_per_second": 0.896, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 15.079275198187995, | |
| "grad_norm": 1.3528636693954468, | |
| "learning_rate": 3.262518968133536e-05, | |
| "loss": 0.5176, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 15.192525481313703, | |
| "grad_norm": 1.3476513624191284, | |
| "learning_rate": 3.1360647445624685e-05, | |
| "loss": 0.5032, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 15.192525481313703, | |
| "eval_loss": 0.7792025804519653, | |
| "eval_runtime": 219.73, | |
| "eval_samples_per_second": 7.141, | |
| "eval_steps_per_second": 0.897, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 15.305775764439412, | |
| "grad_norm": 1.3222737312316895, | |
| "learning_rate": 3.009610520991401e-05, | |
| "loss": 0.5141, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 15.419026047565119, | |
| "grad_norm": 1.3413212299346924, | |
| "learning_rate": 2.883156297420334e-05, | |
| "loss": 0.5071, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 15.419026047565119, | |
| "eval_loss": 0.7755314707756042, | |
| "eval_runtime": 219.6961, | |
| "eval_samples_per_second": 7.142, | |
| "eval_steps_per_second": 0.897, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 15.532276330690827, | |
| "grad_norm": 1.4169390201568604, | |
| "learning_rate": 2.7567020738492665e-05, | |
| "loss": 0.5066, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 15.645526613816534, | |
| "grad_norm": 1.499665379524231, | |
| "learning_rate": 2.6302478502781997e-05, | |
| "loss": 0.5098, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 15.645526613816534, | |
| "eval_loss": 0.7675374150276184, | |
| "eval_runtime": 219.7136, | |
| "eval_samples_per_second": 7.141, | |
| "eval_steps_per_second": 0.897, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 15.758776896942242, | |
| "grad_norm": 1.408177137374878, | |
| "learning_rate": 2.5037936267071323e-05, | |
| "loss": 0.5074, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 15.87202718006795, | |
| "grad_norm": 1.5971038341522217, | |
| "learning_rate": 2.3773394031360648e-05, | |
| "loss": 0.4994, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 15.87202718006795, | |
| "eval_loss": 0.7616310715675354, | |
| "eval_runtime": 219.6635, | |
| "eval_samples_per_second": 7.143, | |
| "eval_steps_per_second": 0.897, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 15.985277463193658, | |
| "grad_norm": 1.4312022924423218, | |
| "learning_rate": 2.2508851795649977e-05, | |
| "loss": 0.5081, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 16.099660249150624, | |
| "grad_norm": 1.4189964532852173, | |
| "learning_rate": 2.1244309559939302e-05, | |
| "loss": 0.4831, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 16.099660249150624, | |
| "eval_loss": 0.758693277835846, | |
| "eval_runtime": 219.6153, | |
| "eval_samples_per_second": 7.144, | |
| "eval_steps_per_second": 0.897, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 16.212910532276332, | |
| "grad_norm": 1.429587960243225, | |
| "learning_rate": 1.9979767324228628e-05, | |
| "loss": 0.4677, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 16.326160815402037, | |
| "grad_norm": 1.5730829238891602, | |
| "learning_rate": 1.8715225088517957e-05, | |
| "loss": 0.4744, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 16.326160815402037, | |
| "eval_loss": 0.7522332668304443, | |
| "eval_runtime": 219.6797, | |
| "eval_samples_per_second": 7.142, | |
| "eval_steps_per_second": 0.897, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 16.439411098527746, | |
| "grad_norm": 1.3818005323410034, | |
| "learning_rate": 1.7450682852807286e-05, | |
| "loss": 0.4821, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 16.552661381653454, | |
| "grad_norm": 1.3803259134292603, | |
| "learning_rate": 1.618614061709661e-05, | |
| "loss": 0.4839, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 16.552661381653454, | |
| "eval_loss": 0.7453923225402832, | |
| "eval_runtime": 219.5972, | |
| "eval_samples_per_second": 7.145, | |
| "eval_steps_per_second": 0.897, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 16.665911664779163, | |
| "grad_norm": 1.4981536865234375, | |
| "learning_rate": 1.492159838138594e-05, | |
| "loss": 0.468, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 16.77916194790487, | |
| "grad_norm": 1.3549158573150635, | |
| "learning_rate": 1.3657056145675265e-05, | |
| "loss": 0.461, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 16.77916194790487, | |
| "eval_loss": 0.7414634823799133, | |
| "eval_runtime": 219.6011, | |
| "eval_samples_per_second": 7.145, | |
| "eval_steps_per_second": 0.897, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 16.892412231030576, | |
| "grad_norm": 1.4302562475204468, | |
| "learning_rate": 1.2392513909964594e-05, | |
| "loss": 0.477, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 17.006795016987542, | |
| "grad_norm": 1.2383838891983032, | |
| "learning_rate": 1.112797167425392e-05, | |
| "loss": 0.4667, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 17.006795016987542, | |
| "eval_loss": 0.7361006140708923, | |
| "eval_runtime": 219.5894, | |
| "eval_samples_per_second": 7.145, | |
| "eval_steps_per_second": 0.897, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 17.12004530011325, | |
| "grad_norm": 1.2482600212097168, | |
| "learning_rate": 9.863429438543249e-06, | |
| "loss": 0.4463, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 17.23329558323896, | |
| "grad_norm": 1.264907956123352, | |
| "learning_rate": 8.598887202832576e-06, | |
| "loss": 0.4479, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 17.23329558323896, | |
| "eval_loss": 0.7333863973617554, | |
| "eval_runtime": 219.7057, | |
| "eval_samples_per_second": 7.141, | |
| "eval_steps_per_second": 0.897, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 17.346545866364664, | |
| "grad_norm": 1.2122907638549805, | |
| "learning_rate": 7.334344967121902e-06, | |
| "loss": 0.4535, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 17.459796149490373, | |
| "grad_norm": 1.176712989807129, | |
| "learning_rate": 6.06980273141123e-06, | |
| "loss": 0.4404, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 17.459796149490373, | |
| "eval_loss": 0.7308885455131531, | |
| "eval_runtime": 219.6198, | |
| "eval_samples_per_second": 7.144, | |
| "eval_steps_per_second": 0.897, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 17.57304643261608, | |
| "grad_norm": 1.264377474784851, | |
| "learning_rate": 4.805260495700556e-06, | |
| "loss": 0.4425, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 17.68629671574179, | |
| "grad_norm": 1.3030773401260376, | |
| "learning_rate": 3.5407182599898835e-06, | |
| "loss": 0.4393, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 17.68629671574179, | |
| "eval_loss": 0.7286545634269714, | |
| "eval_runtime": 219.6474, | |
| "eval_samples_per_second": 7.143, | |
| "eval_steps_per_second": 0.897, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 17.7995469988675, | |
| "grad_norm": 1.362890601158142, | |
| "learning_rate": 2.276176024279211e-06, | |
| "loss": 0.4425, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 17.912797281993203, | |
| "grad_norm": 1.2483875751495361, | |
| "learning_rate": 1.0116337885685382e-06, | |
| "loss": 0.4434, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 17.912797281993203, | |
| "eval_loss": 0.7274926900863647, | |
| "eval_runtime": 219.7356, | |
| "eval_samples_per_second": 7.14, | |
| "eval_steps_per_second": 0.897, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 18.02718006795017, | |
| "grad_norm": 1.2370803356170654, | |
| "learning_rate": 1.9840728100113766e-05, | |
| "loss": 0.4474, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 18.140430351075878, | |
| "grad_norm": 1.454135537147522, | |
| "learning_rate": 1.8703071672354948e-05, | |
| "loss": 0.4294, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 18.140430351075878, | |
| "eval_loss": 0.7317793965339661, | |
| "eval_runtime": 216.9455, | |
| "eval_samples_per_second": 7.232, | |
| "eval_steps_per_second": 0.908, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 18.253680634201586, | |
| "grad_norm": 1.4219353199005127, | |
| "learning_rate": 1.7565415244596133e-05, | |
| "loss": 0.4362, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 18.366930917327295, | |
| "grad_norm": 1.4157588481903076, | |
| "learning_rate": 1.6427758816837314e-05, | |
| "loss": 0.4369, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 18.366930917327295, | |
| "eval_loss": 0.7285795211791992, | |
| "eval_runtime": 217.2426, | |
| "eval_samples_per_second": 7.222, | |
| "eval_steps_per_second": 0.907, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 18.480181200453, | |
| "grad_norm": 1.5712941884994507, | |
| "learning_rate": 1.52901023890785e-05, | |
| "loss": 0.4393, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 18.59343148357871, | |
| "grad_norm": 1.3149316310882568, | |
| "learning_rate": 1.4152445961319682e-05, | |
| "loss": 0.4377, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 18.59343148357871, | |
| "eval_loss": 0.7221394181251526, | |
| "eval_runtime": 217.271, | |
| "eval_samples_per_second": 7.221, | |
| "eval_steps_per_second": 0.907, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 18.706681766704417, | |
| "grad_norm": 1.4053345918655396, | |
| "learning_rate": 1.3014789533560864e-05, | |
| "loss": 0.4395, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 18.819932049830125, | |
| "grad_norm": 1.4755219221115112, | |
| "learning_rate": 1.1877133105802047e-05, | |
| "loss": 0.4464, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 18.819932049830125, | |
| "eval_loss": 0.7166544795036316, | |
| "eval_runtime": 217.3739, | |
| "eval_samples_per_second": 7.218, | |
| "eval_steps_per_second": 0.906, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 18.933182332955834, | |
| "grad_norm": 1.3762329816818237, | |
| "learning_rate": 1.073947667804323e-05, | |
| "loss": 0.4482, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 19.047565118912797, | |
| "grad_norm": 1.1810795068740845, | |
| "learning_rate": 9.62457337883959e-06, | |
| "loss": 0.4264, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 19.047565118912797, | |
| "eval_loss": 0.7138365507125854, | |
| "eval_runtime": 217.4069, | |
| "eval_samples_per_second": 7.217, | |
| "eval_steps_per_second": 0.906, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 19.160815402038505, | |
| "grad_norm": 1.51250422000885, | |
| "learning_rate": 8.486916951080774e-06, | |
| "loss": 0.4197, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 19.274065685164214, | |
| "grad_norm": 1.3608779907226562, | |
| "learning_rate": 7.349260523321957e-06, | |
| "loss": 0.4149, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 19.274065685164214, | |
| "eval_loss": 0.7112516760826111, | |
| "eval_runtime": 217.3073, | |
| "eval_samples_per_second": 7.22, | |
| "eval_steps_per_second": 0.907, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 19.387315968289922, | |
| "grad_norm": 1.3459504842758179, | |
| "learning_rate": 6.21160409556314e-06, | |
| "loss": 0.4151, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 19.500566251415627, | |
| "grad_norm": 1.270430326461792, | |
| "learning_rate": 5.073947667804323e-06, | |
| "loss": 0.4107, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 19.500566251415627, | |
| "eval_loss": 0.7087224721908569, | |
| "eval_runtime": 217.4347, | |
| "eval_samples_per_second": 7.216, | |
| "eval_steps_per_second": 0.906, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 19.613816534541336, | |
| "grad_norm": 1.147330641746521, | |
| "learning_rate": 3.936291240045506e-06, | |
| "loss": 0.4204, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 19.727066817667044, | |
| "grad_norm": 1.3679783344268799, | |
| "learning_rate": 2.8213879408418657e-06, | |
| "loss": 0.4241, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 19.727066817667044, | |
| "eval_loss": 0.705489456653595, | |
| "eval_runtime": 217.5656, | |
| "eval_samples_per_second": 7.212, | |
| "eval_steps_per_second": 0.905, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 19.840317100792753, | |
| "grad_norm": 1.2595313787460327, | |
| "learning_rate": 1.6837315130830492e-06, | |
| "loss": 0.4157, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 19.95356738391846, | |
| "grad_norm": 1.3279147148132324, | |
| "learning_rate": 5.460750853242321e-07, | |
| "loss": 0.4127, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 19.95356738391846, | |
| "eval_loss": 0.7047748565673828, | |
| "eval_runtime": 217.3024, | |
| "eval_samples_per_second": 7.22, | |
| "eval_steps_per_second": 0.907, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 20.067950169875424, | |
| "grad_norm": 1.4412195682525635, | |
| "learning_rate": 2.5650153268070802e-05, | |
| "loss": 0.4066, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 20.181200453001132, | |
| "grad_norm": 1.591495156288147, | |
| "learning_rate": 2.466132700484525e-05, | |
| "loss": 0.4107, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 20.181200453001132, | |
| "eval_loss": 0.7136498093605042, | |
| "eval_runtime": 217.2853, | |
| "eval_samples_per_second": 7.221, | |
| "eval_steps_per_second": 0.907, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 20.29445073612684, | |
| "grad_norm": 1.5843544006347656, | |
| "learning_rate": 2.3672500741619698e-05, | |
| "loss": 0.4249, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 20.40770101925255, | |
| "grad_norm": 1.7842884063720703, | |
| "learning_rate": 2.268367447839415e-05, | |
| "loss": 0.4292, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 20.40770101925255, | |
| "eval_loss": 0.7056994438171387, | |
| "eval_runtime": 217.247, | |
| "eval_samples_per_second": 7.222, | |
| "eval_steps_per_second": 0.907, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 20.520951302378258, | |
| "grad_norm": 1.8154791593551636, | |
| "learning_rate": 2.1694848215168594e-05, | |
| "loss": 0.4171, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 20.634201585503963, | |
| "grad_norm": 1.810947060585022, | |
| "learning_rate": 2.0706021951943045e-05, | |
| "loss": 0.4254, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 20.634201585503963, | |
| "eval_loss": 0.7009139060974121, | |
| "eval_runtime": 217.3713, | |
| "eval_samples_per_second": 7.218, | |
| "eval_steps_per_second": 0.906, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 20.74745186862967, | |
| "grad_norm": 1.6273292303085327, | |
| "learning_rate": 1.9717195688717493e-05, | |
| "loss": 0.4182, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 20.86070215175538, | |
| "grad_norm": 1.865356206893921, | |
| "learning_rate": 1.8728369425491945e-05, | |
| "loss": 0.4143, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 20.86070215175538, | |
| "eval_loss": 0.6963376402854919, | |
| "eval_runtime": 217.269, | |
| "eval_samples_per_second": 7.221, | |
| "eval_steps_per_second": 0.907, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 20.973952434881088, | |
| "grad_norm": 2.093496799468994, | |
| "learning_rate": 1.773954316226639e-05, | |
| "loss": 0.4116, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 21.08833522083805, | |
| "grad_norm": 1.6501940488815308, | |
| "learning_rate": 1.6750716899040837e-05, | |
| "loss": 0.4071, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 21.08833522083805, | |
| "eval_loss": 0.6935945153236389, | |
| "eval_runtime": 217.1749, | |
| "eval_samples_per_second": 7.225, | |
| "eval_steps_per_second": 0.907, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 21.20158550396376, | |
| "grad_norm": 1.4257782697677612, | |
| "learning_rate": 1.576189063581529e-05, | |
| "loss": 0.3964, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 21.314835787089468, | |
| "grad_norm": 1.7246989011764526, | |
| "learning_rate": 1.4773064372589737e-05, | |
| "loss": 0.3856, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 21.314835787089468, | |
| "eval_loss": 0.6908048391342163, | |
| "eval_runtime": 217.1554, | |
| "eval_samples_per_second": 7.225, | |
| "eval_steps_per_second": 0.907, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 21.428086070215176, | |
| "grad_norm": 1.5051772594451904, | |
| "learning_rate": 1.3784238109364186e-05, | |
| "loss": 0.3903, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 21.541336353340885, | |
| "grad_norm": 1.4209738969802856, | |
| "learning_rate": 1.2795411846138633e-05, | |
| "loss": 0.3992, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 21.541336353340885, | |
| "eval_loss": 0.6845880746841431, | |
| "eval_runtime": 217.0726, | |
| "eval_samples_per_second": 7.228, | |
| "eval_steps_per_second": 0.908, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 21.65458663646659, | |
| "grad_norm": 1.4793322086334229, | |
| "learning_rate": 1.1806585582913082e-05, | |
| "loss": 0.392, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 21.7678369195923, | |
| "grad_norm": 1.5042359828948975, | |
| "learning_rate": 1.0817759319687532e-05, | |
| "loss": 0.3833, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 21.7678369195923, | |
| "eval_loss": 0.6798712611198425, | |
| "eval_runtime": 217.2033, | |
| "eval_samples_per_second": 7.224, | |
| "eval_steps_per_second": 0.907, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 21.881087202718007, | |
| "grad_norm": 1.4992612600326538, | |
| "learning_rate": 9.82893305646198e-06, | |
| "loss": 0.3912, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 21.994337485843715, | |
| "grad_norm": 1.4592713117599487, | |
| "learning_rate": 8.840106793236428e-06, | |
| "loss": 0.3931, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 21.994337485843715, | |
| "eval_loss": 0.6735964417457581, | |
| "eval_runtime": 217.2254, | |
| "eval_samples_per_second": 7.223, | |
| "eval_steps_per_second": 0.907, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 22.108720271800678, | |
| "grad_norm": 1.3605159521102905, | |
| "learning_rate": 7.851280530010878e-06, | |
| "loss": 0.378, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 22.221970554926386, | |
| "grad_norm": 1.4335530996322632, | |
| "learning_rate": 6.862454266785326e-06, | |
| "loss": 0.379, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 22.221970554926386, | |
| "eval_loss": 0.6728909015655518, | |
| "eval_runtime": 217.1793, | |
| "eval_samples_per_second": 7.224, | |
| "eval_steps_per_second": 0.907, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 22.335220838052095, | |
| "grad_norm": 1.2988905906677246, | |
| "learning_rate": 5.873628003559775e-06, | |
| "loss": 0.371, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 22.448471121177803, | |
| "grad_norm": 1.407586693763733, | |
| "learning_rate": 4.884801740334224e-06, | |
| "loss": 0.3719, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 22.448471121177803, | |
| "eval_loss": 0.670095682144165, | |
| "eval_runtime": 217.2529, | |
| "eval_samples_per_second": 7.222, | |
| "eval_steps_per_second": 0.907, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 22.561721404303512, | |
| "grad_norm": 1.5192447900772095, | |
| "learning_rate": 1.9627103228740338e-05, | |
| "loss": 0.3763, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 22.674971687429217, | |
| "grad_norm": 1.6283540725708008, | |
| "learning_rate": 1.8717598908594817e-05, | |
| "loss": 0.3834, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 22.674971687429217, | |
| "eval_loss": 0.6722336411476135, | |
| "eval_runtime": 219.0452, | |
| "eval_samples_per_second": 7.163, | |
| "eval_steps_per_second": 0.899, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 22.788221970554925, | |
| "grad_norm": 1.8841089010238647, | |
| "learning_rate": 1.7808094588449296e-05, | |
| "loss": 0.3766, | |
| "step": 10050 | |
| }, | |
| { | |
| "epoch": 22.901472253680634, | |
| "grad_norm": 1.6647872924804688, | |
| "learning_rate": 1.6898590268303775e-05, | |
| "loss": 0.379, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 22.901472253680634, | |
| "eval_loss": 0.6667923331260681, | |
| "eval_runtime": 219.0896, | |
| "eval_samples_per_second": 7.161, | |
| "eval_steps_per_second": 0.899, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 23.0158550396376, | |
| "grad_norm": 1.56221604347229, | |
| "learning_rate": 1.5989085948158254e-05, | |
| "loss": 0.3724, | |
| "step": 10150 | |
| }, | |
| { | |
| "epoch": 23.12910532276331, | |
| "grad_norm": 1.741861343383789, | |
| "learning_rate": 1.5079581628012735e-05, | |
| "loss": 0.3648, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 23.12910532276331, | |
| "eval_loss": 0.6666680574417114, | |
| "eval_runtime": 219.1233, | |
| "eval_samples_per_second": 7.16, | |
| "eval_steps_per_second": 0.899, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 23.242355605889014, | |
| "grad_norm": 1.4197698831558228, | |
| "learning_rate": 1.4170077307867214e-05, | |
| "loss": 0.3622, | |
| "step": 10250 | |
| }, | |
| { | |
| "epoch": 23.355605889014722, | |
| "grad_norm": 1.5689094066619873, | |
| "learning_rate": 1.3260572987721692e-05, | |
| "loss": 0.3633, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 23.355605889014722, | |
| "eval_loss": 0.6614246368408203, | |
| "eval_runtime": 219.1161, | |
| "eval_samples_per_second": 7.161, | |
| "eval_steps_per_second": 0.899, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 23.46885617214043, | |
| "grad_norm": 1.73819899559021, | |
| "learning_rate": 1.2351068667576171e-05, | |
| "loss": 0.3665, | |
| "step": 10350 | |
| }, | |
| { | |
| "epoch": 23.58210645526614, | |
| "grad_norm": 1.470841884613037, | |
| "learning_rate": 1.1441564347430652e-05, | |
| "loss": 0.3594, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 23.58210645526614, | |
| "eval_loss": 0.6564630270004272, | |
| "eval_runtime": 219.1222, | |
| "eval_samples_per_second": 7.16, | |
| "eval_steps_per_second": 0.899, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 23.695356738391848, | |
| "grad_norm": 1.4712560176849365, | |
| "learning_rate": 1.0532060027285131e-05, | |
| "loss": 0.3567, | |
| "step": 10450 | |
| }, | |
| { | |
| "epoch": 23.808607021517552, | |
| "grad_norm": 1.3822436332702637, | |
| "learning_rate": 9.622555707139608e-06, | |
| "loss": 0.3655, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 23.808607021517552, | |
| "eval_loss": 0.6519103050231934, | |
| "eval_runtime": 219.1979, | |
| "eval_samples_per_second": 7.158, | |
| "eval_steps_per_second": 0.899, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 23.92185730464326, | |
| "grad_norm": 1.49004065990448, | |
| "learning_rate": 8.713051386994087e-06, | |
| "loss": 0.356, | |
| "step": 10550 | |
| }, | |
| { | |
| "epoch": 24.036240090600227, | |
| "grad_norm": 1.3333971500396729, | |
| "learning_rate": 7.803547066848568e-06, | |
| "loss": 0.3609, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 24.036240090600227, | |
| "eval_loss": 0.6471272706985474, | |
| "eval_runtime": 219.2457, | |
| "eval_samples_per_second": 7.156, | |
| "eval_steps_per_second": 0.899, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 24.149490373725936, | |
| "grad_norm": 1.3648090362548828, | |
| "learning_rate": 6.894042746703047e-06, | |
| "loss": 0.3445, | |
| "step": 10650 | |
| }, | |
| { | |
| "epoch": 24.26274065685164, | |
| "grad_norm": 1.2211579084396362, | |
| "learning_rate": 5.984538426557527e-06, | |
| "loss": 0.3438, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 24.26274065685164, | |
| "eval_loss": 0.6461014151573181, | |
| "eval_runtime": 219.1852, | |
| "eval_samples_per_second": 7.158, | |
| "eval_steps_per_second": 0.899, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 24.37599093997735, | |
| "grad_norm": 1.332571029663086, | |
| "learning_rate": 5.075034106412006e-06, | |
| "loss": 0.3378, | |
| "step": 10750 | |
| }, | |
| { | |
| "epoch": 24.489241223103058, | |
| "grad_norm": 1.263708233833313, | |
| "learning_rate": 4.1655297862664855e-06, | |
| "loss": 0.3457, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 24.489241223103058, | |
| "eval_loss": 0.6429575681686401, | |
| "eval_runtime": 219.1956, | |
| "eval_samples_per_second": 7.158, | |
| "eval_steps_per_second": 0.899, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 24.602491506228766, | |
| "grad_norm": 1.2414239645004272, | |
| "learning_rate": 3.256025466120964e-06, | |
| "loss": 0.3478, | |
| "step": 10850 | |
| }, | |
| { | |
| "epoch": 24.715741789354475, | |
| "grad_norm": 1.183813214302063, | |
| "learning_rate": 2.3465211459754434e-06, | |
| "loss": 0.3413, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 24.715741789354475, | |
| "eval_loss": 0.6409078240394592, | |
| "eval_runtime": 219.161, | |
| "eval_samples_per_second": 7.159, | |
| "eval_steps_per_second": 0.899, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 24.82899207248018, | |
| "grad_norm": 1.3728307485580444, | |
| "learning_rate": 1.4370168258299228e-06, | |
| "loss": 0.3453, | |
| "step": 10950 | |
| }, | |
| { | |
| "epoch": 24.942242355605888, | |
| "grad_norm": 1.182039499282837, | |
| "learning_rate": 5.275125056844021e-07, | |
| "loss": 0.3439, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 24.942242355605888, | |
| "eval_loss": 0.6399772763252258, | |
| "eval_runtime": 219.2001, | |
| "eval_samples_per_second": 7.158, | |
| "eval_steps_per_second": 0.899, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 25.056625141562854, | |
| "grad_norm": 1.50559401512146, | |
| "learning_rate": 1.4498610760293005e-05, | |
| "loss": 0.3502, | |
| "step": 11050 | |
| }, | |
| { | |
| "epoch": 25.169875424688563, | |
| "grad_norm": 1.501145839691162, | |
| "learning_rate": 1.3656647301507115e-05, | |
| "loss": 0.3373, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 25.169875424688563, | |
| "eval_loss": 0.6438981294631958, | |
| "eval_runtime": 219.2088, | |
| "eval_samples_per_second": 7.158, | |
| "eval_steps_per_second": 0.899, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 25.28312570781427, | |
| "grad_norm": 2.4662117958068848, | |
| "learning_rate": 1.2814683842721226e-05, | |
| "loss": 0.3419, | |
| "step": 11150 | |
| }, | |
| { | |
| "epoch": 25.396375990939976, | |
| "grad_norm": 1.5162239074707031, | |
| "learning_rate": 1.1972720383935337e-05, | |
| "loss": 0.3452, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 25.396375990939976, | |
| "eval_loss": 0.6388878226280212, | |
| "eval_runtime": 219.2066, | |
| "eval_samples_per_second": 7.158, | |
| "eval_steps_per_second": 0.899, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 25.509626274065685, | |
| "grad_norm": 1.315088152885437, | |
| "learning_rate": 1.113075692514945e-05, | |
| "loss": 0.3487, | |
| "step": 11250 | |
| }, | |
| { | |
| "epoch": 25.622876557191393, | |
| "grad_norm": 1.4352425336837769, | |
| "learning_rate": 1.028879346636356e-05, | |
| "loss": 0.3386, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 25.622876557191393, | |
| "eval_loss": 0.6349427700042725, | |
| "eval_runtime": 219.2727, | |
| "eval_samples_per_second": 7.155, | |
| "eval_steps_per_second": 0.898, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 25.7361268403171, | |
| "grad_norm": 1.433242678642273, | |
| "learning_rate": 9.446830007577671e-06, | |
| "loss": 0.3365, | |
| "step": 11350 | |
| }, | |
| { | |
| "epoch": 25.84937712344281, | |
| "grad_norm": 1.343719720840454, | |
| "learning_rate": 8.604866548791782e-06, | |
| "loss": 0.3409, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 25.84937712344281, | |
| "eval_loss": 0.631538987159729, | |
| "eval_runtime": 219.1534, | |
| "eval_samples_per_second": 7.159, | |
| "eval_steps_per_second": 0.899, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 25.962627406568515, | |
| "grad_norm": 1.496169090270996, | |
| "learning_rate": 7.762903090005893e-06, | |
| "loss": 0.3316, | |
| "step": 11450 | |
| }, | |
| { | |
| "epoch": 26.07701019252548, | |
| "grad_norm": 1.5395649671554565, | |
| "learning_rate": 6.920939631220005e-06, | |
| "loss": 0.3429, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 26.07701019252548, | |
| "eval_loss": 0.6306207180023193, | |
| "eval_runtime": 219.1057, | |
| "eval_samples_per_second": 7.161, | |
| "eval_steps_per_second": 0.899, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 26.19026047565119, | |
| "grad_norm": 1.298531413078308, | |
| "learning_rate": 6.078976172434116e-06, | |
| "loss": 0.3274, | |
| "step": 11550 | |
| }, | |
| { | |
| "epoch": 26.3035107587769, | |
| "grad_norm": 1.3206506967544556, | |
| "learning_rate": 5.237012713648228e-06, | |
| "loss": 0.3281, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 26.3035107587769, | |
| "eval_loss": 0.6274815797805786, | |
| "eval_runtime": 219.1686, | |
| "eval_samples_per_second": 7.159, | |
| "eval_steps_per_second": 0.899, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 26.416761041902603, | |
| "grad_norm": 1.3031998872756958, | |
| "learning_rate": 4.395049254862339e-06, | |
| "loss": 0.3186, | |
| "step": 11650 | |
| }, | |
| { | |
| "epoch": 26.530011325028312, | |
| "grad_norm": 1.232765555381775, | |
| "learning_rate": 3.5530857960764503e-06, | |
| "loss": 0.324, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 26.530011325028312, | |
| "eval_loss": 0.6246664524078369, | |
| "eval_runtime": 219.2008, | |
| "eval_samples_per_second": 7.158, | |
| "eval_steps_per_second": 0.899, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 26.64326160815402, | |
| "grad_norm": 1.3108420372009277, | |
| "learning_rate": 2.7111223372905617e-06, | |
| "loss": 0.3265, | |
| "step": 11750 | |
| }, | |
| { | |
| "epoch": 26.75651189127973, | |
| "grad_norm": 1.2938895225524902, | |
| "learning_rate": 1.8691588785046728e-06, | |
| "loss": 0.325, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 26.75651189127973, | |
| "eval_loss": 0.6232734322547913, | |
| "eval_runtime": 219.1916, | |
| "eval_samples_per_second": 7.158, | |
| "eval_steps_per_second": 0.899, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 26.869762174405437, | |
| "grad_norm": 1.4028679132461548, | |
| "learning_rate": 1.0271954197187842e-06, | |
| "loss": 0.3177, | |
| "step": 11850 | |
| }, | |
| { | |
| "epoch": 26.983012457531142, | |
| "grad_norm": 1.1903717517852783, | |
| "learning_rate": 1.8523196093289553e-07, | |
| "loss": 0.3282, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 26.983012457531142, | |
| "eval_loss": 0.6224809885025024, | |
| "eval_runtime": 219.1746, | |
| "eval_samples_per_second": 7.159, | |
| "eval_steps_per_second": 0.899, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 27.09739524348811, | |
| "grad_norm": 1.5629881620407104, | |
| "learning_rate": 1.9454545454545457e-05, | |
| "loss": 0.322, | |
| "step": 11950 | |
| }, | |
| { | |
| "epoch": 27.210645526613817, | |
| "grad_norm": 1.8830535411834717, | |
| "learning_rate": 1.86969696969697e-05, | |
| "loss": 0.3272, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 27.210645526613817, | |
| "eval_loss": 0.6294634342193604, | |
| "eval_runtime": 216.3539, | |
| "eval_samples_per_second": 7.252, | |
| "eval_steps_per_second": 0.911, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 27.323895809739525, | |
| "grad_norm": 2.302112340927124, | |
| "learning_rate": 1.793939393939394e-05, | |
| "loss": 0.3343, | |
| "step": 12050 | |
| }, | |
| { | |
| "epoch": 27.43714609286523, | |
| "grad_norm": 1.6443369388580322, | |
| "learning_rate": 1.718181818181818e-05, | |
| "loss": 0.3331, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 27.43714609286523, | |
| "eval_loss": 0.6251102685928345, | |
| "eval_runtime": 216.4341, | |
| "eval_samples_per_second": 7.249, | |
| "eval_steps_per_second": 0.91, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 27.55039637599094, | |
| "grad_norm": 1.6903585195541382, | |
| "learning_rate": 1.6424242424242424e-05, | |
| "loss": 0.3338, | |
| "step": 12150 | |
| }, | |
| { | |
| "epoch": 27.663646659116647, | |
| "grad_norm": 1.6333993673324585, | |
| "learning_rate": 1.5666666666666667e-05, | |
| "loss": 0.3293, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 27.663646659116647, | |
| "eval_loss": 0.6229289174079895, | |
| "eval_runtime": 216.5229, | |
| "eval_samples_per_second": 7.246, | |
| "eval_steps_per_second": 0.91, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 27.776896942242356, | |
| "grad_norm": 1.7001616954803467, | |
| "learning_rate": 1.4909090909090908e-05, | |
| "loss": 0.3245, | |
| "step": 12250 | |
| }, | |
| { | |
| "epoch": 27.890147225368064, | |
| "grad_norm": 1.919396162033081, | |
| "learning_rate": 1.4151515151515152e-05, | |
| "loss": 0.3284, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 27.890147225368064, | |
| "eval_loss": 0.6198203563690186, | |
| "eval_runtime": 216.5445, | |
| "eval_samples_per_second": 7.246, | |
| "eval_steps_per_second": 0.91, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 28.004530011325027, | |
| "grad_norm": 2.137244462966919, | |
| "learning_rate": 1.3393939393939395e-05, | |
| "loss": 0.3363, | |
| "step": 12350 | |
| }, | |
| { | |
| "epoch": 28.117780294450736, | |
| "grad_norm": 2.0852112770080566, | |
| "learning_rate": 1.2636363636363638e-05, | |
| "loss": 0.31, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 28.117780294450736, | |
| "eval_loss": 0.6158381104469299, | |
| "eval_runtime": 216.498, | |
| "eval_samples_per_second": 7.247, | |
| "eval_steps_per_second": 0.91, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 28.231030577576444, | |
| "grad_norm": 1.7770031690597534, | |
| "learning_rate": 1.187878787878788e-05, | |
| "loss": 0.3094, | |
| "step": 12450 | |
| }, | |
| { | |
| "epoch": 28.344280860702153, | |
| "grad_norm": 2.2683119773864746, | |
| "learning_rate": 1.1136363636363637e-05, | |
| "loss": 0.3106, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 28.344280860702153, | |
| "eval_loss": 0.6127829551696777, | |
| "eval_runtime": 216.6692, | |
| "eval_samples_per_second": 7.241, | |
| "eval_steps_per_second": 0.909, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 28.45753114382786, | |
| "grad_norm": 2.2971391677856445, | |
| "learning_rate": 1.037878787878788e-05, | |
| "loss": 0.3123, | |
| "step": 12550 | |
| }, | |
| { | |
| "epoch": 28.570781426953566, | |
| "grad_norm": 1.5072888135910034, | |
| "learning_rate": 9.62121212121212e-06, | |
| "loss": 0.3126, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 28.570781426953566, | |
| "eval_loss": 0.6085474491119385, | |
| "eval_runtime": 216.6624, | |
| "eval_samples_per_second": 7.242, | |
| "eval_steps_per_second": 0.909, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 28.684031710079275, | |
| "grad_norm": 1.9965884685516357, | |
| "learning_rate": 8.863636363636365e-06, | |
| "loss": 0.3159, | |
| "step": 12650 | |
| }, | |
| { | |
| "epoch": 28.797281993204983, | |
| "grad_norm": 1.9271585941314697, | |
| "learning_rate": 8.106060606060606e-06, | |
| "loss": 0.317, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 28.797281993204983, | |
| "eval_loss": 0.6035783886909485, | |
| "eval_runtime": 216.6906, | |
| "eval_samples_per_second": 7.241, | |
| "eval_steps_per_second": 0.909, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 28.91053227633069, | |
| "grad_norm": 1.6005176305770874, | |
| "learning_rate": 7.3484848484848486e-06, | |
| "loss": 0.3063, | |
| "step": 12750 | |
| }, | |
| { | |
| "epoch": 29.024915062287654, | |
| "grad_norm": 1.3837414979934692, | |
| "learning_rate": 6.59090909090909e-06, | |
| "loss": 0.3149, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 29.024915062287654, | |
| "eval_loss": 0.6015561819076538, | |
| "eval_runtime": 216.6327, | |
| "eval_samples_per_second": 7.243, | |
| "eval_steps_per_second": 0.909, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 29.138165345413363, | |
| "grad_norm": 1.3116227388381958, | |
| "learning_rate": 5.833333333333334e-06, | |
| "loss": 0.2962, | |
| "step": 12850 | |
| }, | |
| { | |
| "epoch": 29.25141562853907, | |
| "grad_norm": 1.3354703187942505, | |
| "learning_rate": 5.075757575757576e-06, | |
| "loss": 0.2966, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 29.25141562853907, | |
| "eval_loss": 0.5984891653060913, | |
| "eval_runtime": 216.6939, | |
| "eval_samples_per_second": 7.241, | |
| "eval_steps_per_second": 0.909, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 29.36466591166478, | |
| "grad_norm": 1.1777273416519165, | |
| "learning_rate": 4.3181818181818185e-06, | |
| "loss": 0.2989, | |
| "step": 12950 | |
| }, | |
| { | |
| "epoch": 29.477916194790488, | |
| "grad_norm": 1.9163764715194702, | |
| "learning_rate": 3.5606060606060608e-06, | |
| "loss": 0.3063, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 29.477916194790488, | |
| "eval_loss": 0.5958673357963562, | |
| "eval_runtime": 216.5489, | |
| "eval_samples_per_second": 7.245, | |
| "eval_steps_per_second": 0.91, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 29.591166477916193, | |
| "grad_norm": 1.3537064790725708, | |
| "learning_rate": 2.803030303030303e-06, | |
| "loss": 0.2951, | |
| "step": 13050 | |
| }, | |
| { | |
| "epoch": 29.7044167610419, | |
| "grad_norm": 1.3078798055648804, | |
| "learning_rate": 2.0454545454545457e-06, | |
| "loss": 0.2963, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 29.7044167610419, | |
| "eval_loss": 0.5946142077445984, | |
| "eval_runtime": 216.5944, | |
| "eval_samples_per_second": 7.244, | |
| "eval_steps_per_second": 0.91, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 29.81766704416761, | |
| "grad_norm": 1.289014458656311, | |
| "learning_rate": 1.287878787878788e-06, | |
| "loss": 0.2959, | |
| "step": 13150 | |
| }, | |
| { | |
| "epoch": 29.93091732729332, | |
| "grad_norm": 1.3634095191955566, | |
| "learning_rate": 5.303030303030304e-07, | |
| "loss": 0.2942, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 29.93091732729332, | |
| "eval_loss": 0.5935017466545105, | |
| "eval_runtime": 216.5971, | |
| "eval_samples_per_second": 7.244, | |
| "eval_steps_per_second": 0.91, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 30.045300113250285, | |
| "grad_norm": 1.3688397407531738, | |
| "learning_rate": 1.8012807271224955e-05, | |
| "loss": 0.2984, | |
| "step": 13250 | |
| }, | |
| { | |
| "epoch": 30.15855039637599, | |
| "grad_norm": 1.7011109590530396, | |
| "learning_rate": 1.7324244302141433e-05, | |
| "loss": 0.2943, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 30.15855039637599, | |
| "eval_loss": 0.6008950471878052, | |
| "eval_runtime": 216.6155, | |
| "eval_samples_per_second": 7.243, | |
| "eval_steps_per_second": 0.909, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 30.2718006795017, | |
| "grad_norm": 1.7913622856140137, | |
| "learning_rate": 1.663568133305791e-05, | |
| "loss": 0.2931, | |
| "step": 13350 | |
| }, | |
| { | |
| "epoch": 30.385050962627407, | |
| "grad_norm": 1.8850469589233398, | |
| "learning_rate": 1.5947118363974385e-05, | |
| "loss": 0.305, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 30.385050962627407, | |
| "eval_loss": 0.5989060997962952, | |
| "eval_runtime": 216.6435, | |
| "eval_samples_per_second": 7.242, | |
| "eval_steps_per_second": 0.909, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 30.498301245753115, | |
| "grad_norm": 1.997729778289795, | |
| "learning_rate": 1.5258555394890863e-05, | |
| "loss": 0.3083, | |
| "step": 13450 | |
| }, | |
| { | |
| "epoch": 30.611551528878824, | |
| "grad_norm": 1.8760637044906616, | |
| "learning_rate": 1.4569992425807341e-05, | |
| "loss": 0.3006, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 30.611551528878824, | |
| "eval_loss": 0.596034824848175, | |
| "eval_runtime": 216.6232, | |
| "eval_samples_per_second": 7.243, | |
| "eval_steps_per_second": 0.909, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 30.72480181200453, | |
| "grad_norm": 1.8395705223083496, | |
| "learning_rate": 1.388142945672382e-05, | |
| "loss": 0.3057, | |
| "step": 13550 | |
| }, | |
| { | |
| "epoch": 30.838052095130237, | |
| "grad_norm": 1.8442955017089844, | |
| "learning_rate": 1.3192866487640296e-05, | |
| "loss": 0.3038, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 30.838052095130237, | |
| "eval_loss": 0.5910864472389221, | |
| "eval_runtime": 216.6693, | |
| "eval_samples_per_second": 7.241, | |
| "eval_steps_per_second": 0.909, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 30.951302378255946, | |
| "grad_norm": 1.9047316312789917, | |
| "learning_rate": 1.2504303518556774e-05, | |
| "loss": 0.2949, | |
| "step": 13650 | |
| }, | |
| { | |
| "epoch": 31.065685164212912, | |
| "grad_norm": 1.8259665966033936, | |
| "learning_rate": 1.181574054947325e-05, | |
| "loss": 0.2984, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 31.065685164212912, | |
| "eval_loss": 0.5886039733886719, | |
| "eval_runtime": 216.6071, | |
| "eval_samples_per_second": 7.244, | |
| "eval_steps_per_second": 0.909, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 31.178935447338617, | |
| "grad_norm": 1.9464973211288452, | |
| "learning_rate": 1.1127177580389728e-05, | |
| "loss": 0.2915, | |
| "step": 13750 | |
| }, | |
| { | |
| "epoch": 31.292185730464325, | |
| "grad_norm": 1.4512701034545898, | |
| "learning_rate": 1.0438614611306204e-05, | |
| "loss": 0.2865, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 31.292185730464325, | |
| "eval_loss": 0.5855095982551575, | |
| "eval_runtime": 216.5764, | |
| "eval_samples_per_second": 7.245, | |
| "eval_steps_per_second": 0.91, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 31.405436013590034, | |
| "grad_norm": 1.6476430892944336, | |
| "learning_rate": 9.750051642222682e-06, | |
| "loss": 0.2794, | |
| "step": 13850 | |
| }, | |
| { | |
| "epoch": 31.518686296715742, | |
| "grad_norm": 2.3963589668273926, | |
| "learning_rate": 9.06148867313916e-06, | |
| "loss": 0.2958, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 31.518686296715742, | |
| "eval_loss": 0.5817484259605408, | |
| "eval_runtime": 216.6161, | |
| "eval_samples_per_second": 7.243, | |
| "eval_steps_per_second": 0.909, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 31.63193657984145, | |
| "grad_norm": 1.6295278072357178, | |
| "learning_rate": 8.372925704055636e-06, | |
| "loss": 0.2842, | |
| "step": 13950 | |
| }, | |
| { | |
| "epoch": 31.745186862967156, | |
| "grad_norm": 1.7011767625808716, | |
| "learning_rate": 7.684362734972115e-06, | |
| "loss": 0.2853, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 31.745186862967156, | |
| "eval_loss": 0.5777027010917664, | |
| "eval_runtime": 216.6548, | |
| "eval_samples_per_second": 7.242, | |
| "eval_steps_per_second": 0.909, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 31.858437146092864, | |
| "grad_norm": 1.2951115369796753, | |
| "learning_rate": 6.995799765888592e-06, | |
| "loss": 0.2822, | |
| "step": 14050 | |
| }, | |
| { | |
| "epoch": 31.971687429218573, | |
| "grad_norm": 1.6724634170532227, | |
| "learning_rate": 6.307236796805067e-06, | |
| "loss": 0.282, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 31.971687429218573, | |
| "eval_loss": 0.5746533274650574, | |
| "eval_runtime": 216.6678, | |
| "eval_samples_per_second": 7.242, | |
| "eval_steps_per_second": 0.909, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 32.08607021517554, | |
| "grad_norm": 1.1698694229125977, | |
| "learning_rate": 5.618673827721545e-06, | |
| "loss": 0.2858, | |
| "step": 14150 | |
| }, | |
| { | |
| "epoch": 32.19932049830125, | |
| "grad_norm": 1.4823814630508423, | |
| "learning_rate": 4.930110858638023e-06, | |
| "loss": 0.2724, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 32.19932049830125, | |
| "eval_loss": 0.5733225345611572, | |
| "eval_runtime": 216.6993, | |
| "eval_samples_per_second": 7.24, | |
| "eval_steps_per_second": 0.909, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 32.312570781426956, | |
| "grad_norm": 1.2654746770858765, | |
| "learning_rate": 4.2415478895545e-06, | |
| "loss": 0.2668, | |
| "step": 14250 | |
| }, | |
| { | |
| "epoch": 32.425821064552665, | |
| "grad_norm": 1.390316367149353, | |
| "learning_rate": 3.5529849204709775e-06, | |
| "loss": 0.2732, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 32.425821064552665, | |
| "eval_loss": 0.5705001354217529, | |
| "eval_runtime": 216.6456, | |
| "eval_samples_per_second": 7.242, | |
| "eval_steps_per_second": 0.909, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 32.539071347678366, | |
| "grad_norm": 1.0841820240020752, | |
| "learning_rate": 2.864421951387454e-06, | |
| "loss": 0.2757, | |
| "step": 14350 | |
| }, | |
| { | |
| "epoch": 32.652321630804074, | |
| "grad_norm": 1.1355277299880981, | |
| "learning_rate": 2.175858982303932e-06, | |
| "loss": 0.2726, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 32.652321630804074, | |
| "eval_loss": 0.5688679814338684, | |
| "eval_runtime": 216.6214, | |
| "eval_samples_per_second": 7.243, | |
| "eval_steps_per_second": 0.909, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 32.76557191392978, | |
| "grad_norm": 1.3108878135681152, | |
| "learning_rate": 1.4872960132204092e-06, | |
| "loss": 0.273, | |
| "step": 14450 | |
| }, | |
| { | |
| "epoch": 32.87882219705549, | |
| "grad_norm": 1.175482153892517, | |
| "learning_rate": 7.987330441368863e-07, | |
| "loss": 0.2695, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 32.87882219705549, | |
| "eval_loss": 0.567724347114563, | |
| "eval_runtime": 216.7209, | |
| "eval_samples_per_second": 7.24, | |
| "eval_steps_per_second": 0.909, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 32.9920724801812, | |
| "grad_norm": 1.3629848957061768, | |
| "learning_rate": 1.1017007505336364e-07, | |
| "loss": 0.2808, | |
| "step": 14550 | |
| }, | |
| { | |
| "epoch": 33.10645526613816, | |
| "grad_norm": 3.8983519077301025, | |
| "learning_rate": 5.299015897047691e-05, | |
| "loss": 0.2925, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 33.10645526613816, | |
| "eval_loss": 0.6180706024169922, | |
| "eval_runtime": 217.7372, | |
| "eval_samples_per_second": 7.206, | |
| "eval_steps_per_second": 0.905, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 33.21970554926387, | |
| "grad_norm": 3.372912645339966, | |
| "learning_rate": 5.249558415341913e-05, | |
| "loss": 0.3275, | |
| "step": 14650 | |
| }, | |
| { | |
| "epoch": 33.33295583238958, | |
| "grad_norm": 3.959416389465332, | |
| "learning_rate": 5.199091597274792e-05, | |
| "loss": 0.341, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 33.33295583238958, | |
| "eval_loss": 0.6353843808174133, | |
| "eval_runtime": 217.8515, | |
| "eval_samples_per_second": 7.202, | |
| "eval_steps_per_second": 0.904, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 33.44620611551529, | |
| "grad_norm": 3.4942378997802734, | |
| "learning_rate": 5.1486247792076715e-05, | |
| "loss": 0.3485, | |
| "step": 14750 | |
| }, | |
| { | |
| "epoch": 33.559456398641, | |
| "grad_norm": 3.3839058876037598, | |
| "learning_rate": 5.098157961140551e-05, | |
| "loss": 0.3442, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 33.559456398641, | |
| "eval_loss": 0.6342476606369019, | |
| "eval_runtime": 217.9772, | |
| "eval_samples_per_second": 7.198, | |
| "eval_steps_per_second": 0.904, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 33.672706681766705, | |
| "grad_norm": 3.631831407546997, | |
| "learning_rate": 5.04769114307343e-05, | |
| "loss": 0.3505, | |
| "step": 14850 | |
| }, | |
| { | |
| "epoch": 33.785956964892414, | |
| "grad_norm": 3.05415678024292, | |
| "learning_rate": 4.9972243250063086e-05, | |
| "loss": 0.342, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 33.785956964892414, | |
| "eval_loss": 0.6282561421394348, | |
| "eval_runtime": 218.0099, | |
| "eval_samples_per_second": 7.197, | |
| "eval_steps_per_second": 0.904, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 33.89920724801812, | |
| "grad_norm": 3.213174343109131, | |
| "learning_rate": 4.946757506939187e-05, | |
| "loss": 0.3526, | |
| "step": 14950 | |
| }, | |
| { | |
| "epoch": 34.013590033975085, | |
| "grad_norm": 2.7019898891448975, | |
| "learning_rate": 4.8962906888720665e-05, | |
| "loss": 0.3596, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 34.013590033975085, | |
| "eval_loss": 0.6229637265205383, | |
| "eval_runtime": 217.9233, | |
| "eval_samples_per_second": 7.2, | |
| "eval_steps_per_second": 0.904, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 34.12684031710079, | |
| "grad_norm": 3.5813961029052734, | |
| "learning_rate": 4.845823870804946e-05, | |
| "loss": 0.3202, | |
| "step": 15050 | |
| }, | |
| { | |
| "epoch": 34.2400906002265, | |
| "grad_norm": 2.996546983718872, | |
| "learning_rate": 4.795357052737825e-05, | |
| "loss": 0.3208, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 34.2400906002265, | |
| "eval_loss": 0.6200416684150696, | |
| "eval_runtime": 217.9712, | |
| "eval_samples_per_second": 7.198, | |
| "eval_steps_per_second": 0.904, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 34.35334088335221, | |
| "grad_norm": 3.248931407928467, | |
| "learning_rate": 4.744890234670704e-05, | |
| "loss": 0.3191, | |
| "step": 15150 | |
| }, | |
| { | |
| "epoch": 34.46659116647792, | |
| "grad_norm": 2.503894805908203, | |
| "learning_rate": 4.6944234166035835e-05, | |
| "loss": 0.3206, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 34.46659116647792, | |
| "eval_loss": 0.6110924482345581, | |
| "eval_runtime": 217.9474, | |
| "eval_samples_per_second": 7.199, | |
| "eval_steps_per_second": 0.904, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 34.57984144960363, | |
| "grad_norm": 3.3107473850250244, | |
| "learning_rate": 4.643956598536463e-05, | |
| "loss": 0.3198, | |
| "step": 15250 | |
| }, | |
| { | |
| "epoch": 34.69309173272933, | |
| "grad_norm": 2.6435258388519287, | |
| "learning_rate": 4.5934897804693414e-05, | |
| "loss": 0.3261, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 34.69309173272933, | |
| "eval_loss": 0.603391706943512, | |
| "eval_runtime": 217.9424, | |
| "eval_samples_per_second": 7.199, | |
| "eval_steps_per_second": 0.904, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 34.80634201585504, | |
| "grad_norm": 3.1980810165405273, | |
| "learning_rate": 4.5430229624022207e-05, | |
| "loss": 0.3216, | |
| "step": 15350 | |
| }, | |
| { | |
| "epoch": 34.919592298980746, | |
| "grad_norm": 2.4994754791259766, | |
| "learning_rate": 4.4925561443351e-05, | |
| "loss": 0.3168, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 34.919592298980746, | |
| "eval_loss": 0.5940945148468018, | |
| "eval_runtime": 218.056, | |
| "eval_samples_per_second": 7.195, | |
| "eval_steps_per_second": 0.903, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 35.033975084937715, | |
| "grad_norm": 2.75138521194458, | |
| "learning_rate": 4.442089326267979e-05, | |
| "loss": 0.3191, | |
| "step": 15450 | |
| }, | |
| { | |
| "epoch": 35.14722536806342, | |
| "grad_norm": 3.1039974689483643, | |
| "learning_rate": 4.3916225082008585e-05, | |
| "loss": 0.296, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 35.14722536806342, | |
| "eval_loss": 0.5926975607872009, | |
| "eval_runtime": 218.0089, | |
| "eval_samples_per_second": 7.197, | |
| "eval_steps_per_second": 0.904, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 35.260475651189125, | |
| "grad_norm": 2.9686388969421387, | |
| "learning_rate": 4.341155690133738e-05, | |
| "loss": 0.2921, | |
| "step": 15550 | |
| }, | |
| { | |
| "epoch": 35.373725934314834, | |
| "grad_norm": 2.5670547485351562, | |
| "learning_rate": 4.290688872066616e-05, | |
| "loss": 0.2909, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 35.373725934314834, | |
| "eval_loss": 0.5892407894134521, | |
| "eval_runtime": 217.8483, | |
| "eval_samples_per_second": 7.202, | |
| "eval_steps_per_second": 0.904, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 35.48697621744054, | |
| "grad_norm": 2.28952956199646, | |
| "learning_rate": 4.2402220539994956e-05, | |
| "loss": 0.2947, | |
| "step": 15650 | |
| }, | |
| { | |
| "epoch": 35.60022650056625, | |
| "grad_norm": 2.401625394821167, | |
| "learning_rate": 4.189755235932374e-05, | |
| "loss": 0.2915, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 35.60022650056625, | |
| "eval_loss": 0.5815189480781555, | |
| "eval_runtime": 217.8623, | |
| "eval_samples_per_second": 7.202, | |
| "eval_steps_per_second": 0.904, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 35.71347678369196, | |
| "grad_norm": 2.7113890647888184, | |
| "learning_rate": 4.1392884178652534e-05, | |
| "loss": 0.2908, | |
| "step": 15750 | |
| }, | |
| { | |
| "epoch": 35.82672706681767, | |
| "grad_norm": 2.949303388595581, | |
| "learning_rate": 4.088821599798133e-05, | |
| "loss": 0.2942, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 35.82672706681767, | |
| "eval_loss": 0.5712306499481201, | |
| "eval_runtime": 217.792, | |
| "eval_samples_per_second": 7.204, | |
| "eval_steps_per_second": 0.905, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 35.939977349943376, | |
| "grad_norm": 2.3547251224517822, | |
| "learning_rate": 4.038354781731012e-05, | |
| "loss": 0.2854, | |
| "step": 15850 | |
| }, | |
| { | |
| "epoch": 36.05436013590034, | |
| "grad_norm": 2.6130595207214355, | |
| "learning_rate": 3.987887963663891e-05, | |
| "loss": 0.2877, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 36.05436013590034, | |
| "eval_loss": 0.5668493509292603, | |
| "eval_runtime": 217.7584, | |
| "eval_samples_per_second": 7.205, | |
| "eval_steps_per_second": 0.905, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 36.16761041902605, | |
| "grad_norm": 2.4720046520233154, | |
| "learning_rate": 3.9374211455967705e-05, | |
| "loss": 0.272, | |
| "step": 15950 | |
| }, | |
| { | |
| "epoch": 36.280860702151756, | |
| "grad_norm": 3.291337490081787, | |
| "learning_rate": 3.886954327529649e-05, | |
| "loss": 0.2756, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 36.280860702151756, | |
| "eval_loss": 0.5569508075714111, | |
| "eval_runtime": 217.7243, | |
| "eval_samples_per_second": 7.206, | |
| "eval_steps_per_second": 0.905, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 36.394110985277464, | |
| "grad_norm": 2.275122880935669, | |
| "learning_rate": 3.8364875094625284e-05, | |
| "loss": 0.2699, | |
| "step": 16050 | |
| }, | |
| { | |
| "epoch": 36.50736126840317, | |
| "grad_norm": 2.351252317428589, | |
| "learning_rate": 3.7860206913954076e-05, | |
| "loss": 0.263, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 36.50736126840317, | |
| "eval_loss": 0.552777886390686, | |
| "eval_runtime": 217.6644, | |
| "eval_samples_per_second": 7.208, | |
| "eval_steps_per_second": 0.905, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 36.62061155152888, | |
| "grad_norm": 2.0470945835113525, | |
| "learning_rate": 3.735553873328287e-05, | |
| "loss": 0.2605, | |
| "step": 16150 | |
| }, | |
| { | |
| "epoch": 36.73386183465459, | |
| "grad_norm": 2.258258819580078, | |
| "learning_rate": 3.685087055261166e-05, | |
| "loss": 0.2621, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 36.73386183465459, | |
| "eval_loss": 0.548316478729248, | |
| "eval_runtime": 217.7617, | |
| "eval_samples_per_second": 7.205, | |
| "eval_steps_per_second": 0.905, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 36.84711211778029, | |
| "grad_norm": 2.473788261413574, | |
| "learning_rate": 3.6346202371940454e-05, | |
| "loss": 0.2606, | |
| "step": 16250 | |
| }, | |
| { | |
| "epoch": 36.960362400906, | |
| "grad_norm": 2.4730281829833984, | |
| "learning_rate": 3.584153419126925e-05, | |
| "loss": 0.2674, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 36.960362400906, | |
| "eval_loss": 0.5399536490440369, | |
| "eval_runtime": 217.745, | |
| "eval_samples_per_second": 7.206, | |
| "eval_steps_per_second": 0.905, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 37.07474518686297, | |
| "grad_norm": 2.3119349479675293, | |
| "learning_rate": 3.533686601059803e-05, | |
| "loss": 0.258, | |
| "step": 16350 | |
| }, | |
| { | |
| "epoch": 37.18799546998868, | |
| "grad_norm": 2.451964855194092, | |
| "learning_rate": 3.4832197829926826e-05, | |
| "loss": 0.2452, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 37.18799546998868, | |
| "eval_loss": 0.5389652252197266, | |
| "eval_runtime": 217.7139, | |
| "eval_samples_per_second": 7.207, | |
| "eval_steps_per_second": 0.905, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 37.30124575311438, | |
| "grad_norm": 2.2861897945404053, | |
| "learning_rate": 3.432752964925562e-05, | |
| "loss": 0.2483, | |
| "step": 16450 | |
| }, | |
| { | |
| "epoch": 37.41449603624009, | |
| "grad_norm": 1.7861238718032837, | |
| "learning_rate": 3.3822861468584404e-05, | |
| "loss": 0.2493, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 37.41449603624009, | |
| "eval_loss": 0.5293774604797363, | |
| "eval_runtime": 217.8898, | |
| "eval_samples_per_second": 7.201, | |
| "eval_steps_per_second": 0.904, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 37.5277463193658, | |
| "grad_norm": 2.2910056114196777, | |
| "learning_rate": 3.33181932879132e-05, | |
| "loss": 0.2449, | |
| "step": 16550 | |
| }, | |
| { | |
| "epoch": 37.640996602491505, | |
| "grad_norm": 2.102193832397461, | |
| "learning_rate": 3.281352510724199e-05, | |
| "loss": 0.2398, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 37.640996602491505, | |
| "eval_loss": 0.5246281027793884, | |
| "eval_runtime": 217.7811, | |
| "eval_samples_per_second": 7.204, | |
| "eval_steps_per_second": 0.905, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 37.75424688561721, | |
| "grad_norm": 2.1423254013061523, | |
| "learning_rate": 3.230885692657078e-05, | |
| "loss": 0.2438, | |
| "step": 16650 | |
| }, | |
| { | |
| "epoch": 37.86749716874292, | |
| "grad_norm": 2.031027317047119, | |
| "learning_rate": 3.180418874589957e-05, | |
| "loss": 0.2427, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 37.86749716874292, | |
| "eval_loss": 0.5190041661262512, | |
| "eval_runtime": 217.6886, | |
| "eval_samples_per_second": 7.208, | |
| "eval_steps_per_second": 0.905, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 37.98074745186863, | |
| "grad_norm": 1.8530203104019165, | |
| "learning_rate": 3.129952056522836e-05, | |
| "loss": 0.2446, | |
| "step": 16750 | |
| }, | |
| { | |
| "epoch": 38.09513023782559, | |
| "grad_norm": 1.9591715335845947, | |
| "learning_rate": 3.0794852384557153e-05, | |
| "loss": 0.2288, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 38.09513023782559, | |
| "eval_loss": 0.5154264569282532, | |
| "eval_runtime": 217.6837, | |
| "eval_samples_per_second": 7.208, | |
| "eval_steps_per_second": 0.905, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 38.2083805209513, | |
| "grad_norm": 1.752700686454773, | |
| "learning_rate": 3.0290184203885946e-05, | |
| "loss": 0.2249, | |
| "step": 16850 | |
| }, | |
| { | |
| "epoch": 38.32163080407701, | |
| "grad_norm": 1.7865016460418701, | |
| "learning_rate": 2.978551602321474e-05, | |
| "loss": 0.2254, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 38.32163080407701, | |
| "eval_loss": 0.510138750076294, | |
| "eval_runtime": 217.7044, | |
| "eval_samples_per_second": 7.207, | |
| "eval_steps_per_second": 0.905, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 38.43488108720272, | |
| "grad_norm": 1.851835012435913, | |
| "learning_rate": 2.9280847842543528e-05, | |
| "loss": 0.2255, | |
| "step": 16950 | |
| }, | |
| { | |
| "epoch": 38.54813137032843, | |
| "grad_norm": 1.7320882081985474, | |
| "learning_rate": 2.877617966187232e-05, | |
| "loss": 0.227, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 38.54813137032843, | |
| "eval_loss": 0.5055522322654724, | |
| "eval_runtime": 217.6805, | |
| "eval_samples_per_second": 7.208, | |
| "eval_steps_per_second": 0.905, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 38.661381653454136, | |
| "grad_norm": 2.6240079402923584, | |
| "learning_rate": 2.8271511481201113e-05, | |
| "loss": 0.2227, | |
| "step": 17050 | |
| }, | |
| { | |
| "epoch": 38.774631936579844, | |
| "grad_norm": 1.8069425821304321, | |
| "learning_rate": 2.7766843300529906e-05, | |
| "loss": 0.223, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 38.774631936579844, | |
| "eval_loss": 0.49882474541664124, | |
| "eval_runtime": 217.6912, | |
| "eval_samples_per_second": 7.207, | |
| "eval_steps_per_second": 0.905, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 38.88788221970555, | |
| "grad_norm": 1.8260191679000854, | |
| "learning_rate": 2.7262175119858695e-05, | |
| "loss": 0.2239, | |
| "step": 17150 | |
| }, | |
| { | |
| "epoch": 39.002265005662515, | |
| "grad_norm": 5.091439723968506, | |
| "learning_rate": 2.6757506939187488e-05, | |
| "loss": 0.2295, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 39.002265005662515, | |
| "eval_loss": 0.49227145314216614, | |
| "eval_runtime": 217.7384, | |
| "eval_samples_per_second": 7.206, | |
| "eval_steps_per_second": 0.905, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 39.115515288788224, | |
| "grad_norm": 2.467454433441162, | |
| "learning_rate": 2.625283875851628e-05, | |
| "loss": 0.2057, | |
| "step": 17250 | |
| }, | |
| { | |
| "epoch": 39.22876557191393, | |
| "grad_norm": 1.6467406749725342, | |
| "learning_rate": 2.5748170577845067e-05, | |
| "loss": 0.2022, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 39.22876557191393, | |
| "eval_loss": 0.49300825595855713, | |
| "eval_runtime": 217.7288, | |
| "eval_samples_per_second": 7.206, | |
| "eval_steps_per_second": 0.905, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 39.34201585503964, | |
| "grad_norm": 1.446031093597412, | |
| "learning_rate": 2.5243502397173856e-05, | |
| "loss": 0.2055, | |
| "step": 17350 | |
| }, | |
| { | |
| "epoch": 39.45526613816534, | |
| "grad_norm": 1.6686514616012573, | |
| "learning_rate": 2.4738834216502652e-05, | |
| "loss": 0.2147, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 39.45526613816534, | |
| "eval_loss": 0.485858678817749, | |
| "eval_runtime": 217.207, | |
| "eval_samples_per_second": 7.224, | |
| "eval_steps_per_second": 0.907, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 39.56851642129105, | |
| "grad_norm": 1.513580322265625, | |
| "learning_rate": 2.4234166035831445e-05, | |
| "loss": 0.2046, | |
| "step": 17450 | |
| }, | |
| { | |
| "epoch": 39.68176670441676, | |
| "grad_norm": 1.5527840852737427, | |
| "learning_rate": 2.372949785516023e-05, | |
| "loss": 0.2039, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 39.68176670441676, | |
| "eval_loss": 0.48166778683662415, | |
| "eval_runtime": 217.3046, | |
| "eval_samples_per_second": 7.22, | |
| "eval_steps_per_second": 0.907, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 39.79501698754247, | |
| "grad_norm": 1.5010417699813843, | |
| "learning_rate": 2.3224829674489023e-05, | |
| "loss": 0.21, | |
| "step": 17550 | |
| }, | |
| { | |
| "epoch": 39.908267270668176, | |
| "grad_norm": 2.1489455699920654, | |
| "learning_rate": 2.2720161493817816e-05, | |
| "loss": 0.2042, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 39.908267270668176, | |
| "eval_loss": 0.4749002754688263, | |
| "eval_runtime": 217.3286, | |
| "eval_samples_per_second": 7.219, | |
| "eval_steps_per_second": 0.906, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 40.02265005662514, | |
| "grad_norm": 1.933009147644043, | |
| "learning_rate": 2.221549331314661e-05, | |
| "loss": 0.2157, | |
| "step": 17650 | |
| }, | |
| { | |
| "epoch": 40.13590033975085, | |
| "grad_norm": 1.5398054122924805, | |
| "learning_rate": 2.1710825132475398e-05, | |
| "loss": 0.1894, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 40.13590033975085, | |
| "eval_loss": 0.4718286097049713, | |
| "eval_runtime": 217.1637, | |
| "eval_samples_per_second": 7.225, | |
| "eval_steps_per_second": 0.907, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 40.249150622876556, | |
| "grad_norm": 1.7476941347122192, | |
| "learning_rate": 2.120615695180419e-05, | |
| "loss": 0.1924, | |
| "step": 17750 | |
| }, | |
| { | |
| "epoch": 40.362400906002264, | |
| "grad_norm": 1.5386378765106201, | |
| "learning_rate": 2.0701488771132983e-05, | |
| "loss": 0.1918, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 40.362400906002264, | |
| "eval_loss": 0.46827250719070435, | |
| "eval_runtime": 217.1473, | |
| "eval_samples_per_second": 7.226, | |
| "eval_steps_per_second": 0.907, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 40.47565118912797, | |
| "grad_norm": 1.6006604433059692, | |
| "learning_rate": 2.0196820590461773e-05, | |
| "loss": 0.188, | |
| "step": 17850 | |
| }, | |
| { | |
| "epoch": 40.58890147225368, | |
| "grad_norm": 1.5906981229782104, | |
| "learning_rate": 1.9692152409790562e-05, | |
| "loss": 0.1922, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 40.58890147225368, | |
| "eval_loss": 0.4618977904319763, | |
| "eval_runtime": 217.1294, | |
| "eval_samples_per_second": 7.226, | |
| "eval_steps_per_second": 0.907, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 40.70215175537939, | |
| "grad_norm": 1.451889991760254, | |
| "learning_rate": 1.9187484229119355e-05, | |
| "loss": 0.1961, | |
| "step": 17950 | |
| }, | |
| { | |
| "epoch": 40.8154020385051, | |
| "grad_norm": 1.2037873268127441, | |
| "learning_rate": 1.8682816048448147e-05, | |
| "loss": 0.1951, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 40.8154020385051, | |
| "eval_loss": 0.45853978395462036, | |
| "eval_runtime": 217.1867, | |
| "eval_samples_per_second": 7.224, | |
| "eval_steps_per_second": 0.907, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 40.92865232163081, | |
| "grad_norm": 1.124363899230957, | |
| "learning_rate": 1.8178147867776936e-05, | |
| "loss": 0.1907, | |
| "step": 18050 | |
| }, | |
| { | |
| "epoch": 41.04303510758777, | |
| "grad_norm": 1.1726500988006592, | |
| "learning_rate": 1.767347968710573e-05, | |
| "loss": 0.1893, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 41.04303510758777, | |
| "eval_loss": 0.4549981355667114, | |
| "eval_runtime": 217.2331, | |
| "eval_samples_per_second": 7.223, | |
| "eval_steps_per_second": 0.907, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 41.15628539071348, | |
| "grad_norm": 1.6041500568389893, | |
| "learning_rate": 1.7168811506434522e-05, | |
| "loss": 0.1769, | |
| "step": 18150 | |
| }, | |
| { | |
| "epoch": 41.26953567383919, | |
| "grad_norm": 1.9704344272613525, | |
| "learning_rate": 1.666414332576331e-05, | |
| "loss": 0.1798, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 41.26953567383919, | |
| "eval_loss": 0.45383498072624207, | |
| "eval_runtime": 217.2468, | |
| "eval_samples_per_second": 7.222, | |
| "eval_steps_per_second": 0.907, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 41.382785956964895, | |
| "grad_norm": 1.1522181034088135, | |
| "learning_rate": 1.6159475145092104e-05, | |
| "loss": 0.1858, | |
| "step": 18250 | |
| }, | |
| { | |
| "epoch": 41.4960362400906, | |
| "grad_norm": 1.6338062286376953, | |
| "learning_rate": 1.5654806964420893e-05, | |
| "loss": 0.1776, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 41.4960362400906, | |
| "eval_loss": 0.448618620634079, | |
| "eval_runtime": 217.2186, | |
| "eval_samples_per_second": 7.223, | |
| "eval_steps_per_second": 0.907, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 41.609286523216305, | |
| "grad_norm": 1.1537904739379883, | |
| "learning_rate": 1.5150138783749684e-05, | |
| "loss": 0.1759, | |
| "step": 18350 | |
| }, | |
| { | |
| "epoch": 41.72253680634201, | |
| "grad_norm": 1.285271406173706, | |
| "learning_rate": 1.4645470603078477e-05, | |
| "loss": 0.1794, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 41.72253680634201, | |
| "eval_loss": 0.4447907507419586, | |
| "eval_runtime": 217.1396, | |
| "eval_samples_per_second": 7.226, | |
| "eval_steps_per_second": 0.907, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 41.83578708946772, | |
| "grad_norm": 1.125063419342041, | |
| "learning_rate": 1.4140802422407268e-05, | |
| "loss": 0.1756, | |
| "step": 18450 | |
| }, | |
| { | |
| "epoch": 41.94903737259343, | |
| "grad_norm": 1.1060149669647217, | |
| "learning_rate": 1.3636134241736059e-05, | |
| "loss": 0.1787, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 41.94903737259343, | |
| "eval_loss": 0.4420225918292999, | |
| "eval_runtime": 217.4988, | |
| "eval_samples_per_second": 7.214, | |
| "eval_steps_per_second": 0.906, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 42.06342015855039, | |
| "grad_norm": 1.0146502256393433, | |
| "learning_rate": 1.3131466061064851e-05, | |
| "loss": 0.1791, | |
| "step": 18550 | |
| }, | |
| { | |
| "epoch": 42.1766704416761, | |
| "grad_norm": 1.1884300708770752, | |
| "learning_rate": 1.2626797880393642e-05, | |
| "loss": 0.1658, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 42.1766704416761, | |
| "eval_loss": 0.4396124482154846, | |
| "eval_runtime": 217.7883, | |
| "eval_samples_per_second": 7.204, | |
| "eval_steps_per_second": 0.905, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 42.28992072480181, | |
| "grad_norm": 1.1497679948806763, | |
| "learning_rate": 1.2122129699722433e-05, | |
| "loss": 0.1696, | |
| "step": 18650 | |
| }, | |
| { | |
| "epoch": 42.40317100792752, | |
| "grad_norm": 1.32937490940094, | |
| "learning_rate": 1.1617461519051224e-05, | |
| "loss": 0.1643, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 42.40317100792752, | |
| "eval_loss": 0.43940281867980957, | |
| "eval_runtime": 218.6239, | |
| "eval_samples_per_second": 7.177, | |
| "eval_steps_per_second": 0.901, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 42.51642129105323, | |
| "grad_norm": 1.5960180759429932, | |
| "learning_rate": 1.1112793338380017e-05, | |
| "loss": 0.1699, | |
| "step": 18750 | |
| }, | |
| { | |
| "epoch": 42.629671574178936, | |
| "grad_norm": 1.0415377616882324, | |
| "learning_rate": 1.0608125157708806e-05, | |
| "loss": 0.1654, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 42.629671574178936, | |
| "eval_loss": 0.43373051285743713, | |
| "eval_runtime": 218.5653, | |
| "eval_samples_per_second": 7.179, | |
| "eval_steps_per_second": 0.901, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 42.742921857304644, | |
| "grad_norm": 1.5094951391220093, | |
| "learning_rate": 1.0103456977037597e-05, | |
| "loss": 0.1669, | |
| "step": 18850 | |
| }, | |
| { | |
| "epoch": 42.85617214043035, | |
| "grad_norm": 0.9974751472473145, | |
| "learning_rate": 9.59878879636639e-06, | |
| "loss": 0.1681, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 42.85617214043035, | |
| "eval_loss": 0.4303882420063019, | |
| "eval_runtime": 218.6322, | |
| "eval_samples_per_second": 7.176, | |
| "eval_steps_per_second": 0.901, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 42.96942242355606, | |
| "grad_norm": 0.9117754697799683, | |
| "learning_rate": 9.094120615695181e-06, | |
| "loss": 0.1706, | |
| "step": 18950 | |
| }, | |
| { | |
| "epoch": 43.083805209513024, | |
| "grad_norm": 1.0373188257217407, | |
| "learning_rate": 8.589452435023972e-06, | |
| "loss": 0.1643, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 43.083805209513024, | |
| "eval_loss": 0.42856693267822266, | |
| "eval_runtime": 218.5993, | |
| "eval_samples_per_second": 7.178, | |
| "eval_steps_per_second": 0.901, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 43.19705549263873, | |
| "grad_norm": 0.9998382329940796, | |
| "learning_rate": 8.084784254352763e-06, | |
| "loss": 0.1617, | |
| "step": 19050 | |
| }, | |
| { | |
| "epoch": 43.31030577576444, | |
| "grad_norm": 0.9849778413772583, | |
| "learning_rate": 7.580116073681555e-06, | |
| "loss": 0.1603, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 43.31030577576444, | |
| "eval_loss": 0.4269334077835083, | |
| "eval_runtime": 218.6737, | |
| "eval_samples_per_second": 7.175, | |
| "eval_steps_per_second": 0.901, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 43.42355605889015, | |
| "grad_norm": 1.2009530067443848, | |
| "learning_rate": 7.0754478930103465e-06, | |
| "loss": 0.157, | |
| "step": 19150 | |
| }, | |
| { | |
| "epoch": 43.53680634201586, | |
| "grad_norm": 0.8868136405944824, | |
| "learning_rate": 6.570779712339137e-06, | |
| "loss": 0.1582, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 43.53680634201586, | |
| "eval_loss": 0.42409417033195496, | |
| "eval_runtime": 218.6076, | |
| "eval_samples_per_second": 7.177, | |
| "eval_steps_per_second": 0.901, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 43.650056625141566, | |
| "grad_norm": 0.8435959815979004, | |
| "learning_rate": 6.0661115316679285e-06, | |
| "loss": 0.158, | |
| "step": 19250 | |
| }, | |
| { | |
| "epoch": 43.76330690826727, | |
| "grad_norm": 1.1476356983184814, | |
| "learning_rate": 5.56144335099672e-06, | |
| "loss": 0.1608, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 43.76330690826727, | |
| "eval_loss": 0.422664076089859, | |
| "eval_runtime": 218.5875, | |
| "eval_samples_per_second": 7.178, | |
| "eval_steps_per_second": 0.901, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 43.876557191392976, | |
| "grad_norm": 0.765332043170929, | |
| "learning_rate": 5.056775170325511e-06, | |
| "loss": 0.1606, | |
| "step": 19350 | |
| }, | |
| { | |
| "epoch": 43.989807474518685, | |
| "grad_norm": 0.9879748821258545, | |
| "learning_rate": 4.552106989654302e-06, | |
| "loss": 0.1573, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 43.989807474518685, | |
| "eval_loss": 0.4201904535293579, | |
| "eval_runtime": 218.5744, | |
| "eval_samples_per_second": 7.178, | |
| "eval_steps_per_second": 0.901, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 44.104190260475654, | |
| "grad_norm": 0.6540424227714539, | |
| "learning_rate": 4.047438808983093e-06, | |
| "loss": 0.1572, | |
| "step": 19450 | |
| }, | |
| { | |
| "epoch": 44.217440543601356, | |
| "grad_norm": 0.9124572277069092, | |
| "learning_rate": 3.542770628311885e-06, | |
| "loss": 0.1498, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 44.217440543601356, | |
| "eval_loss": 0.4200960695743561, | |
| "eval_runtime": 218.5932, | |
| "eval_samples_per_second": 7.178, | |
| "eval_steps_per_second": 0.901, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 44.330690826727064, | |
| "grad_norm": 0.8609676957130432, | |
| "learning_rate": 3.0381024476406765e-06, | |
| "loss": 0.1509, | |
| "step": 19550 | |
| }, | |
| { | |
| "epoch": 44.44394110985277, | |
| "grad_norm": 0.7417690753936768, | |
| "learning_rate": 2.533434266969468e-06, | |
| "loss": 0.1492, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 44.44394110985277, | |
| "eval_loss": 0.41948238015174866, | |
| "eval_runtime": 218.5587, | |
| "eval_samples_per_second": 7.179, | |
| "eval_steps_per_second": 0.901, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 44.55719139297848, | |
| "grad_norm": 0.8361729979515076, | |
| "learning_rate": 2.0287660862982593e-06, | |
| "loss": 0.1541, | |
| "step": 19650 | |
| }, | |
| { | |
| "epoch": 44.67044167610419, | |
| "grad_norm": 0.911729097366333, | |
| "learning_rate": 1.5240979056270503e-06, | |
| "loss": 0.1559, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 44.67044167610419, | |
| "eval_loss": 0.41870439052581787, | |
| "eval_runtime": 218.5239, | |
| "eval_samples_per_second": 7.18, | |
| "eval_steps_per_second": 0.902, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 44.7836919592299, | |
| "grad_norm": 0.7706825733184814, | |
| "learning_rate": 1.0194297249558415e-06, | |
| "loss": 0.1554, | |
| "step": 19750 | |
| }, | |
| { | |
| "epoch": 44.89694224235561, | |
| "grad_norm": 0.9403465986251831, | |
| "learning_rate": 5.147615442846329e-07, | |
| "loss": 0.1549, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 44.89694224235561, | |
| "eval_loss": 0.4180174469947815, | |
| "eval_runtime": 218.6086, | |
| "eval_samples_per_second": 7.177, | |
| "eval_steps_per_second": 0.901, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 45.01132502831257, | |
| "grad_norm": 0.9403154253959656, | |
| "learning_rate": 8.532637580325652e-06, | |
| "loss": 0.1533, | |
| "step": 19850 | |
| }, | |
| { | |
| "epoch": 45.12457531143828, | |
| "grad_norm": 0.8529797196388245, | |
| "learning_rate": 8.049475769435185e-06, | |
| "loss": 0.1507, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 45.12457531143828, | |
| "eval_loss": 0.41985705494880676, | |
| "eval_runtime": 218.5997, | |
| "eval_samples_per_second": 7.178, | |
| "eval_steps_per_second": 0.901, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 45.237825594563986, | |
| "grad_norm": 0.877526581287384, | |
| "learning_rate": 7.5663139585447175e-06, | |
| "loss": 0.1498, | |
| "step": 19950 | |
| }, | |
| { | |
| "epoch": 45.351075877689695, | |
| "grad_norm": 0.9668393731117249, | |
| "learning_rate": 7.0831521476542495e-06, | |
| "loss": 0.152, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 45.351075877689695, | |
| "eval_loss": 0.41871750354766846, | |
| "eval_runtime": 218.6395, | |
| "eval_samples_per_second": 7.176, | |
| "eval_steps_per_second": 0.901, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 45.4643261608154, | |
| "grad_norm": 1.0251694917678833, | |
| "learning_rate": 6.599990336763782e-06, | |
| "loss": 0.1529, | |
| "step": 20050 | |
| }, | |
| { | |
| "epoch": 45.57757644394111, | |
| "grad_norm": 1.4579505920410156, | |
| "learning_rate": 6.116828525873315e-06, | |
| "loss": 0.1571, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 45.57757644394111, | |
| "eval_loss": 0.4161282181739807, | |
| "eval_runtime": 218.584, | |
| "eval_samples_per_second": 7.178, | |
| "eval_steps_per_second": 0.901, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 45.69082672706682, | |
| "grad_norm": 0.7462686896324158, | |
| "learning_rate": 5.633666714982848e-06, | |
| "loss": 0.1611, | |
| "step": 20150 | |
| }, | |
| { | |
| "epoch": 45.80407701019253, | |
| "grad_norm": 0.9031079411506653, | |
| "learning_rate": 5.150504904092381e-06, | |
| "loss": 0.153, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 45.80407701019253, | |
| "eval_loss": 0.41474393010139465, | |
| "eval_runtime": 218.6388, | |
| "eval_samples_per_second": 7.176, | |
| "eval_steps_per_second": 0.901, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 45.91732729331823, | |
| "grad_norm": 0.8560954332351685, | |
| "learning_rate": 4.667343093201913e-06, | |
| "loss": 0.1531, | |
| "step": 20250 | |
| }, | |
| { | |
| "epoch": 46.0317100792752, | |
| "grad_norm": 1.1464442014694214, | |
| "learning_rate": 4.184181282311446e-06, | |
| "loss": 0.1535, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 46.0317100792752, | |
| "eval_loss": 0.414587140083313, | |
| "eval_runtime": 218.5994, | |
| "eval_samples_per_second": 7.178, | |
| "eval_steps_per_second": 0.901, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 46.14496036240091, | |
| "grad_norm": 0.8384661674499512, | |
| "learning_rate": 3.7010194714209794e-06, | |
| "loss": 0.1488, | |
| "step": 20350 | |
| }, | |
| { | |
| "epoch": 46.25821064552662, | |
| "grad_norm": 0.8300140500068665, | |
| "learning_rate": 3.217857660530512e-06, | |
| "loss": 0.1507, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 46.25821064552662, | |
| "eval_loss": 0.413276344537735, | |
| "eval_runtime": 218.6607, | |
| "eval_samples_per_second": 7.176, | |
| "eval_steps_per_second": 0.901, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 46.37146092865232, | |
| "grad_norm": 0.7903048396110535, | |
| "learning_rate": 2.7346958496400447e-06, | |
| "loss": 0.148, | |
| "step": 20450 | |
| }, | |
| { | |
| "epoch": 46.48471121177803, | |
| "grad_norm": 0.888008713722229, | |
| "learning_rate": 2.2515340387495775e-06, | |
| "loss": 0.1447, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 46.48471121177803, | |
| "eval_loss": 0.4132575988769531, | |
| "eval_runtime": 218.6308, | |
| "eval_samples_per_second": 7.176, | |
| "eval_steps_per_second": 0.901, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 46.597961494903736, | |
| "grad_norm": 0.975723147392273, | |
| "learning_rate": 1.7683722278591102e-06, | |
| "loss": 0.1448, | |
| "step": 20550 | |
| }, | |
| { | |
| "epoch": 46.711211778029444, | |
| "grad_norm": 0.7616918087005615, | |
| "learning_rate": 1.2852104169686428e-06, | |
| "loss": 0.1489, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 46.711211778029444, | |
| "eval_loss": 0.4121854305267334, | |
| "eval_runtime": 218.5727, | |
| "eval_samples_per_second": 7.178, | |
| "eval_steps_per_second": 0.901, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 46.82446206115515, | |
| "grad_norm": 0.8662727475166321, | |
| "learning_rate": 8.117118422959849e-07, | |
| "loss": 0.1483, | |
| "step": 20650 | |
| }, | |
| { | |
| "epoch": 46.93771234428086, | |
| "grad_norm": 0.7502096891403198, | |
| "learning_rate": 3.2855003140551773e-07, | |
| "loss": 0.1504, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 46.93771234428086, | |
| "eval_loss": 0.41195544600486755, | |
| "eval_runtime": 218.7035, | |
| "eval_samples_per_second": 7.174, | |
| "eval_steps_per_second": 0.901, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 47.052095130237824, | |
| "grad_norm": 0.9510757923126221, | |
| "learning_rate": 1.1871026339691191e-05, | |
| "loss": 0.1467, | |
| "step": 20750 | |
| }, | |
| { | |
| "epoch": 47.16534541336353, | |
| "grad_norm": 1.0743557214736938, | |
| "learning_rate": 1.1416893732970029e-05, | |
| "loss": 0.1497, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 47.16534541336353, | |
| "eval_loss": 0.4156714379787445, | |
| "eval_runtime": 217.4784, | |
| "eval_samples_per_second": 7.215, | |
| "eval_steps_per_second": 0.906, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 47.27859569648924, | |
| "grad_norm": 1.567784070968628, | |
| "learning_rate": 1.0962761126248864e-05, | |
| "loss": 0.1513, | |
| "step": 20850 | |
| }, | |
| { | |
| "epoch": 47.39184597961495, | |
| "grad_norm": 1.3992472887039185, | |
| "learning_rate": 1.0508628519527702e-05, | |
| "loss": 0.1533, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 47.39184597961495, | |
| "eval_loss": 0.4152044653892517, | |
| "eval_runtime": 217.5597, | |
| "eval_samples_per_second": 7.212, | |
| "eval_steps_per_second": 0.905, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 47.50509626274066, | |
| "grad_norm": 1.5980275869369507, | |
| "learning_rate": 1.005449591280654e-05, | |
| "loss": 0.1523, | |
| "step": 20950 | |
| }, | |
| { | |
| "epoch": 47.618346545866366, | |
| "grad_norm": 1.2810208797454834, | |
| "learning_rate": 9.600363306085377e-06, | |
| "loss": 0.1502, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 47.618346545866366, | |
| "eval_loss": 0.4143332839012146, | |
| "eval_runtime": 218.7933, | |
| "eval_samples_per_second": 7.171, | |
| "eval_steps_per_second": 0.9, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 47.731596828992075, | |
| "grad_norm": 1.4590628147125244, | |
| "learning_rate": 9.146230699364216e-06, | |
| "loss": 0.1512, | |
| "step": 21050 | |
| }, | |
| { | |
| "epoch": 47.84484711211778, | |
| "grad_norm": 1.3043591976165771, | |
| "learning_rate": 8.692098092643053e-06, | |
| "loss": 0.1561, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 47.84484711211778, | |
| "eval_loss": 0.41214144229888916, | |
| "eval_runtime": 218.7674, | |
| "eval_samples_per_second": 7.172, | |
| "eval_steps_per_second": 0.9, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 47.958097395243485, | |
| "grad_norm": 0.8709500432014465, | |
| "learning_rate": 8.247048138056313e-06, | |
| "loss": 0.1478, | |
| "step": 21150 | |
| }, | |
| { | |
| "epoch": 48.072480181200454, | |
| "grad_norm": 1.005632758140564, | |
| "learning_rate": 7.79291553133515e-06, | |
| "loss": 0.1534, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 48.072480181200454, | |
| "eval_loss": 0.4120267927646637, | |
| "eval_runtime": 218.9152, | |
| "eval_samples_per_second": 7.167, | |
| "eval_steps_per_second": 0.9, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 48.18573046432616, | |
| "grad_norm": 1.2001721858978271, | |
| "learning_rate": 7.347865576748411e-06, | |
| "loss": 0.1431, | |
| "step": 21250 | |
| }, | |
| { | |
| "epoch": 48.29898074745187, | |
| "grad_norm": 1.2004830837249756, | |
| "learning_rate": 6.893732970027249e-06, | |
| "loss": 0.1457, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 48.29898074745187, | |
| "eval_loss": 0.4105300009250641, | |
| "eval_runtime": 218.8468, | |
| "eval_samples_per_second": 7.169, | |
| "eval_steps_per_second": 0.9, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 48.41223103057758, | |
| "grad_norm": 1.0889978408813477, | |
| "learning_rate": 6.439600363306085e-06, | |
| "loss": 0.1462, | |
| "step": 21350 | |
| }, | |
| { | |
| "epoch": 48.52548131370328, | |
| "grad_norm": 0.9354040026664734, | |
| "learning_rate": 5.985467756584924e-06, | |
| "loss": 0.1464, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 48.52548131370328, | |
| "eval_loss": 0.40966492891311646, | |
| "eval_runtime": 218.8783, | |
| "eval_samples_per_second": 7.168, | |
| "eval_steps_per_second": 0.9, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 48.63873159682899, | |
| "grad_norm": 0.8427848815917969, | |
| "learning_rate": 5.53133514986376e-06, | |
| "loss": 0.146, | |
| "step": 21450 | |
| }, | |
| { | |
| "epoch": 48.7519818799547, | |
| "grad_norm": 0.9390880465507507, | |
| "learning_rate": 5.077202543142598e-06, | |
| "loss": 0.1462, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 48.7519818799547, | |
| "eval_loss": 0.40723294019699097, | |
| "eval_runtime": 218.8819, | |
| "eval_samples_per_second": 7.168, | |
| "eval_steps_per_second": 0.9, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 48.86523216308041, | |
| "grad_norm": 1.0009453296661377, | |
| "learning_rate": 4.623069936421435e-06, | |
| "loss": 0.1442, | |
| "step": 21550 | |
| }, | |
| { | |
| "epoch": 48.978482446206115, | |
| "grad_norm": 1.11566960811615, | |
| "learning_rate": 4.168937329700273e-06, | |
| "loss": 0.1469, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 48.978482446206115, | |
| "eval_loss": 0.405407190322876, | |
| "eval_runtime": 218.8588, | |
| "eval_samples_per_second": 7.169, | |
| "eval_steps_per_second": 0.9, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 49.09286523216308, | |
| "grad_norm": 0.8854078054428101, | |
| "learning_rate": 3.71480472297911e-06, | |
| "loss": 0.1435, | |
| "step": 21650 | |
| }, | |
| { | |
| "epoch": 49.206115515288786, | |
| "grad_norm": 0.8558112978935242, | |
| "learning_rate": 3.260672116257948e-06, | |
| "loss": 0.1378, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 49.206115515288786, | |
| "eval_loss": 0.4061279296875, | |
| "eval_runtime": 218.8569, | |
| "eval_samples_per_second": 7.169, | |
| "eval_steps_per_second": 0.9, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 49.319365798414495, | |
| "grad_norm": 0.7999886870384216, | |
| "learning_rate": 2.806539509536785e-06, | |
| "loss": 0.1417, | |
| "step": 21750 | |
| }, | |
| { | |
| "epoch": 49.4326160815402, | |
| "grad_norm": 0.948358952999115, | |
| "learning_rate": 2.3524069028156224e-06, | |
| "loss": 0.1415, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 49.4326160815402, | |
| "eval_loss": 0.40446802973747253, | |
| "eval_runtime": 218.7745, | |
| "eval_samples_per_second": 7.172, | |
| "eval_steps_per_second": 0.9, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 49.54586636466591, | |
| "grad_norm": 0.7728579640388489, | |
| "learning_rate": 1.8982742960944597e-06, | |
| "loss": 0.1396, | |
| "step": 21850 | |
| }, | |
| { | |
| "epoch": 49.65911664779162, | |
| "grad_norm": 0.7241719365119934, | |
| "learning_rate": 1.4441416893732972e-06, | |
| "loss": 0.1398, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 49.65911664779162, | |
| "eval_loss": 0.4039037525653839, | |
| "eval_runtime": 218.8617, | |
| "eval_samples_per_second": 7.169, | |
| "eval_steps_per_second": 0.9, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 49.77236693091733, | |
| "grad_norm": 0.7789280414581299, | |
| "learning_rate": 9.900090826521344e-07, | |
| "loss": 0.1427, | |
| "step": 21950 | |
| }, | |
| { | |
| "epoch": 49.88561721404304, | |
| "grad_norm": 0.8703135848045349, | |
| "learning_rate": 5.358764759309719e-07, | |
| "loss": 0.139, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 49.88561721404304, | |
| "eval_loss": 0.40355798602104187, | |
| "eval_runtime": 218.7677, | |
| "eval_samples_per_second": 7.172, | |
| "eval_steps_per_second": 0.9, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 49.998867497168746, | |
| "grad_norm": 0.8729577660560608, | |
| "learning_rate": 8.174386920980928e-08, | |
| "loss": 0.1422, | |
| "step": 22050 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 22050, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 50, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.11661035307008e+18, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |