{
"best_global_step": 22000,
"best_metric": 0.40355798602104187,
"best_model_checkpoint": "Qwen-3-0.6B-it-Medical-LoRA/checkpoint-22000",
"epoch": 49.998867497168746,
"eval_steps": 100,
"global_step": 22050,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.11325028312570781,
"grad_norm": 0.3080866038799286,
"learning_rate": 0.00019075425790754258,
"loss": 1.8711,
"step": 50
},
{
"epoch": 0.22650056625141562,
"grad_norm": 0.28974413871765137,
"learning_rate": 0.0001664233576642336,
"loss": 1.338,
"step": 100
},
{
"epoch": 0.22650056625141562,
"eval_loss": 1.3033037185668945,
"eval_runtime": 217.1489,
"eval_samples_per_second": 7.225,
"eval_steps_per_second": 0.907,
"step": 100
},
{
"epoch": 0.33975084937712347,
"grad_norm": 0.3371483385562897,
"learning_rate": 0.0001420924574209246,
"loss": 1.2923,
"step": 150
},
{
"epoch": 0.45300113250283125,
"grad_norm": 0.35730767250061035,
"learning_rate": 0.00011776155717761557,
"loss": 1.2703,
"step": 200
},
{
"epoch": 0.45300113250283125,
"eval_loss": 1.2490053176879883,
"eval_runtime": 217.4793,
"eval_samples_per_second": 7.214,
"eval_steps_per_second": 0.906,
"step": 200
},
{
"epoch": 0.5662514156285391,
"grad_norm": 0.35110780596733093,
"learning_rate": 9.343065693430657e-05,
"loss": 1.2397,
"step": 250
},
{
"epoch": 0.6795016987542469,
"grad_norm": 0.35077276825904846,
"learning_rate": 6.909975669099758e-05,
"loss": 1.232,
"step": 300
},
{
"epoch": 0.6795016987542469,
"eval_loss": 1.2211977243423462,
"eval_runtime": 217.5364,
"eval_samples_per_second": 7.213,
"eval_steps_per_second": 0.906,
"step": 300
},
{
"epoch": 0.7927519818799547,
"grad_norm": 0.3939191699028015,
"learning_rate": 4.476885644768857e-05,
"loss": 1.2241,
"step": 350
},
{
"epoch": 0.9060022650056625,
"grad_norm": 0.366871178150177,
"learning_rate": 2.0437956204379563e-05,
"loss": 1.2078,
"step": 400
},
{
"epoch": 0.9060022650056625,
"eval_loss": 1.2062289714813232,
"eval_runtime": 217.4838,
"eval_samples_per_second": 7.214,
"eval_steps_per_second": 0.906,
"step": 400
},
{
"epoch": 1.0203850509626273,
"grad_norm": 0.3808969259262085,
"learning_rate": 0.00010164319248826291,
"loss": 1.1819,
"step": 450
},
{
"epoch": 1.1336353340883352,
"grad_norm": 0.43216949701309204,
"learning_rate": 8.990610328638498e-05,
"loss": 1.19,
"step": 500
},
{
"epoch": 1.1336353340883352,
"eval_loss": 1.1994948387145996,
"eval_runtime": 217.3626,
"eval_samples_per_second": 7.218,
"eval_steps_per_second": 0.906,
"step": 500
},
{
"epoch": 1.246885617214043,
"grad_norm": 0.4280295968055725,
"learning_rate": 7.816901408450704e-05,
"loss": 1.1971,
"step": 550
},
{
"epoch": 1.3601359003397508,
"grad_norm": 0.4056779146194458,
"learning_rate": 6.643192488262912e-05,
"loss": 1.1771,
"step": 600
},
{
"epoch": 1.3601359003397508,
"eval_loss": 1.1834282875061035,
"eval_runtime": 217.4514,
"eval_samples_per_second": 7.215,
"eval_steps_per_second": 0.906,
"step": 600
},
{
"epoch": 1.4733861834654587,
"grad_norm": 0.4397243857383728,
"learning_rate": 5.469483568075118e-05,
"loss": 1.1544,
"step": 650
},
{
"epoch": 1.5866364665911665,
"grad_norm": 0.4214654862880707,
"learning_rate": 4.295774647887324e-05,
"loss": 1.1789,
"step": 700
},
{
"epoch": 1.5866364665911665,
"eval_loss": 1.1712530851364136,
"eval_runtime": 217.6023,
"eval_samples_per_second": 7.21,
"eval_steps_per_second": 0.905,
"step": 700
},
{
"epoch": 1.6998867497168741,
"grad_norm": 0.43076109886169434,
"learning_rate": 3.1220657276995305e-05,
"loss": 1.1522,
"step": 750
},
{
"epoch": 1.8131370328425822,
"grad_norm": 0.4253358244895935,
"learning_rate": 1.9483568075117372e-05,
"loss": 1.1508,
"step": 800
},
{
"epoch": 1.8131370328425822,
"eval_loss": 1.1622345447540283,
"eval_runtime": 217.3951,
"eval_samples_per_second": 7.217,
"eval_steps_per_second": 0.906,
"step": 800
},
{
"epoch": 1.9263873159682898,
"grad_norm": 0.4359077215194702,
"learning_rate": 7.746478873239436e-06,
"loss": 1.1422,
"step": 850
},
{
"epoch": 2.0407701019252547,
"grad_norm": 0.4511992633342743,
"learning_rate": 0.00014314687602224403,
"loss": 1.149,
"step": 900
},
{
"epoch": 2.0407701019252547,
"eval_loss": 1.167581558227539,
"eval_runtime": 218.6888,
"eval_samples_per_second": 7.175,
"eval_steps_per_second": 0.901,
"step": 900
},
{
"epoch": 2.1540203850509627,
"grad_norm": 0.47519659996032715,
"learning_rate": 0.00013987569512594048,
"loss": 1.1498,
"step": 950
},
{
"epoch": 2.2672706681766703,
"grad_norm": 0.4559363126754761,
"learning_rate": 0.00013660451422963692,
"loss": 1.1469,
"step": 1000
},
{
"epoch": 2.2672706681766703,
"eval_loss": 1.1536333560943604,
"eval_runtime": 218.921,
"eval_samples_per_second": 7.167,
"eval_steps_per_second": 0.9,
"step": 1000
},
{
"epoch": 2.3805209513023784,
"grad_norm": 0.49805569648742676,
"learning_rate": 0.00013333333333333334,
"loss": 1.123,
"step": 1050
},
{
"epoch": 2.493771234428086,
"grad_norm": 0.4767671823501587,
"learning_rate": 0.00013006215243702978,
"loss": 1.1119,
"step": 1100
},
{
"epoch": 2.493771234428086,
"eval_loss": 1.1321617364883423,
"eval_runtime": 218.8468,
"eval_samples_per_second": 7.169,
"eval_steps_per_second": 0.9,
"step": 1100
},
{
"epoch": 2.607021517553794,
"grad_norm": 0.4313490688800812,
"learning_rate": 0.0001267909715407262,
"loss": 1.1096,
"step": 1150
},
{
"epoch": 2.7202718006795017,
"grad_norm": 0.46401792764663696,
"learning_rate": 0.00012351979064442265,
"loss": 1.0929,
"step": 1200
},
{
"epoch": 2.7202718006795017,
"eval_loss": 1.1138092279434204,
"eval_runtime": 219.2403,
"eval_samples_per_second": 7.157,
"eval_steps_per_second": 0.899,
"step": 1200
},
{
"epoch": 2.8335220838052093,
"grad_norm": 0.46533071994781494,
"learning_rate": 0.00012024860974811907,
"loss": 1.0894,
"step": 1250
},
{
"epoch": 2.9467723669309174,
"grad_norm": 0.42782357335090637,
"learning_rate": 0.00011697742885181551,
"loss": 1.072,
"step": 1300
},
{
"epoch": 2.9467723669309174,
"eval_loss": 1.0986168384552002,
"eval_runtime": 219.0662,
"eval_samples_per_second": 7.162,
"eval_steps_per_second": 0.899,
"step": 1300
},
{
"epoch": 3.061155152887882,
"grad_norm": 0.46029889583587646,
"learning_rate": 0.00011370624795551194,
"loss": 1.0936,
"step": 1350
},
{
"epoch": 3.17440543601359,
"grad_norm": 0.5067735314369202,
"learning_rate": 0.00011043506705920839,
"loss": 1.0304,
"step": 1400
},
{
"epoch": 3.17440543601359,
"eval_loss": 1.0839864015579224,
"eval_runtime": 219.2396,
"eval_samples_per_second": 7.157,
"eval_steps_per_second": 0.899,
"step": 1400
},
{
"epoch": 3.287655719139298,
"grad_norm": 0.46760454773902893,
"learning_rate": 0.0001071638861629048,
"loss": 1.0361,
"step": 1450
},
{
"epoch": 3.4009060022650055,
"grad_norm": 0.5199077129364014,
"learning_rate": 0.00010389270526660124,
"loss": 1.0304,
"step": 1500
},
{
"epoch": 3.4009060022650055,
"eval_loss": 1.070657730102539,
"eval_runtime": 219.1222,
"eval_samples_per_second": 7.16,
"eval_steps_per_second": 0.899,
"step": 1500
},
{
"epoch": 3.5141562853907136,
"grad_norm": 0.496124267578125,
"learning_rate": 0.00010062152437029768,
"loss": 1.0202,
"step": 1550
},
{
"epoch": 3.627406568516421,
"grad_norm": 0.5154497623443604,
"learning_rate": 9.735034347399413e-05,
"loss": 1.007,
"step": 1600
},
{
"epoch": 3.627406568516421,
"eval_loss": 1.0554137229919434,
"eval_runtime": 219.2648,
"eval_samples_per_second": 7.156,
"eval_steps_per_second": 0.898,
"step": 1600
},
{
"epoch": 3.7406568516421292,
"grad_norm": 0.4881006181240082,
"learning_rate": 9.407916257769055e-05,
"loss": 0.9934,
"step": 1650
},
{
"epoch": 3.853907134767837,
"grad_norm": 0.5507743954658508,
"learning_rate": 9.080798168138699e-05,
"loss": 0.9894,
"step": 1700
},
{
"epoch": 3.853907134767837,
"eval_loss": 1.0418345928192139,
"eval_runtime": 219.154,
"eval_samples_per_second": 7.159,
"eval_steps_per_second": 0.899,
"step": 1700
},
{
"epoch": 3.967157417893545,
"grad_norm": 0.5333808064460754,
"learning_rate": 8.753680078508342e-05,
"loss": 0.9795,
"step": 1750
},
{
"epoch": 4.081540203850509,
"grad_norm": 0.551164448261261,
"learning_rate": 8.426561988877985e-05,
"loss": 0.974,
"step": 1800
},
{
"epoch": 4.081540203850509,
"eval_loss": 1.0327671766281128,
"eval_runtime": 219.1392,
"eval_samples_per_second": 7.16,
"eval_steps_per_second": 0.899,
"step": 1800
},
{
"epoch": 4.194790486976218,
"grad_norm": 0.5678717494010925,
"learning_rate": 8.099443899247629e-05,
"loss": 0.9204,
"step": 1850
},
{
"epoch": 4.308040770101925,
"grad_norm": 0.5472707152366638,
"learning_rate": 7.772325809617273e-05,
"loss": 0.9341,
"step": 1900
},
{
"epoch": 4.308040770101925,
"eval_loss": 1.0188047885894775,
"eval_runtime": 219.1338,
"eval_samples_per_second": 7.16,
"eval_steps_per_second": 0.899,
"step": 1900
},
{
"epoch": 4.421291053227633,
"grad_norm": 0.5799363255500793,
"learning_rate": 7.445207719986915e-05,
"loss": 0.9313,
"step": 1950
},
{
"epoch": 4.534541336353341,
"grad_norm": 0.6046631336212158,
"learning_rate": 7.11808963035656e-05,
"loss": 0.9325,
"step": 2000
},
{
"epoch": 4.534541336353341,
"eval_loss": 1.004631519317627,
"eval_runtime": 219.1114,
"eval_samples_per_second": 7.161,
"eval_steps_per_second": 0.899,
"step": 2000
},
{
"epoch": 4.647791619479049,
"grad_norm": 0.5897740721702576,
"learning_rate": 6.790971540726203e-05,
"loss": 0.9213,
"step": 2050
},
{
"epoch": 4.761041902604757,
"grad_norm": 0.583991289138794,
"learning_rate": 6.463853451095846e-05,
"loss": 0.9039,
"step": 2100
},
{
"epoch": 4.761041902604757,
"eval_loss": 0.9938989877700806,
"eval_runtime": 219.0293,
"eval_samples_per_second": 7.163,
"eval_steps_per_second": 0.899,
"step": 2100
},
{
"epoch": 4.874292185730464,
"grad_norm": 0.6264305710792542,
"learning_rate": 6.13673536146549e-05,
"loss": 0.9028,
"step": 2150
},
{
"epoch": 4.987542468856172,
"grad_norm": 0.6474761962890625,
"learning_rate": 5.809617271835133e-05,
"loss": 0.9053,
"step": 2200
},
{
"epoch": 4.987542468856172,
"eval_loss": 0.9845430254936218,
"eval_runtime": 219.2502,
"eval_samples_per_second": 7.156,
"eval_steps_per_second": 0.899,
"step": 2200
},
{
"epoch": 5.101925254813137,
"grad_norm": 0.6595875024795532,
"learning_rate": 5.4824991822047765e-05,
"loss": 0.882,
"step": 2250
},
{
"epoch": 5.215175537938845,
"grad_norm": 0.6405232548713684,
"learning_rate": 5.15538109257442e-05,
"loss": 0.8471,
"step": 2300
},
{
"epoch": 5.215175537938845,
"eval_loss": 0.9782047867774963,
"eval_runtime": 219.0751,
"eval_samples_per_second": 7.162,
"eval_steps_per_second": 0.899,
"step": 2300
},
{
"epoch": 5.3284258210645525,
"grad_norm": 0.6547350287437439,
"learning_rate": 4.828263002944063e-05,
"loss": 0.8602,
"step": 2350
},
{
"epoch": 5.44167610419026,
"grad_norm": 0.7046269178390503,
"learning_rate": 4.501144913313706e-05,
"loss": 0.8404,
"step": 2400
},
{
"epoch": 5.44167610419026,
"eval_loss": 0.9688066244125366,
"eval_runtime": 219.0622,
"eval_samples_per_second": 7.162,
"eval_steps_per_second": 0.899,
"step": 2400
},
{
"epoch": 5.554926387315969,
"grad_norm": 0.6331756114959717,
"learning_rate": 4.17402682368335e-05,
"loss": 0.8286,
"step": 2450
},
{
"epoch": 5.668176670441676,
"grad_norm": 0.7212900519371033,
"learning_rate": 3.846908734052994e-05,
"loss": 0.8382,
"step": 2500
},
{
"epoch": 5.668176670441676,
"eval_loss": 0.9589976668357849,
"eval_runtime": 219.1257,
"eval_samples_per_second": 7.16,
"eval_steps_per_second": 0.899,
"step": 2500
},
{
"epoch": 5.781426953567384,
"grad_norm": 0.6771254539489746,
"learning_rate": 3.519790644422637e-05,
"loss": 0.8359,
"step": 2550
},
{
"epoch": 5.8946772366930915,
"grad_norm": 0.7171376943588257,
"learning_rate": 3.19267255479228e-05,
"loss": 0.832,
"step": 2600
},
{
"epoch": 5.8946772366930915,
"eval_loss": 0.9501948952674866,
"eval_runtime": 219.0768,
"eval_samples_per_second": 7.162,
"eval_steps_per_second": 0.899,
"step": 2600
},
{
"epoch": 6.009060022650057,
"grad_norm": 0.6734739542007446,
"learning_rate": 2.865554465161924e-05,
"loss": 0.8437,
"step": 2650
},
{
"epoch": 6.122310305775764,
"grad_norm": 0.697407603263855,
"learning_rate": 2.538436375531567e-05,
"loss": 0.7937,
"step": 2700
},
{
"epoch": 6.122310305775764,
"eval_loss": 0.9480313658714294,
"eval_runtime": 218.949,
"eval_samples_per_second": 7.166,
"eval_steps_per_second": 0.9,
"step": 2700
},
{
"epoch": 6.235560588901472,
"grad_norm": 0.7092292904853821,
"learning_rate": 2.2113182859012105e-05,
"loss": 0.7804,
"step": 2750
},
{
"epoch": 6.34881087202718,
"grad_norm": 0.7284964919090271,
"learning_rate": 1.884200196270854e-05,
"loss": 0.7861,
"step": 2800
},
{
"epoch": 6.34881087202718,
"eval_loss": 0.942541241645813,
"eval_runtime": 219.1707,
"eval_samples_per_second": 7.159,
"eval_steps_per_second": 0.899,
"step": 2800
},
{
"epoch": 6.462061155152888,
"grad_norm": 0.7725135087966919,
"learning_rate": 1.557082106640497e-05,
"loss": 0.776,
"step": 2850
},
{
"epoch": 6.575311438278596,
"grad_norm": 0.7266800403594971,
"learning_rate": 1.2299640170101408e-05,
"loss": 0.7812,
"step": 2900
},
{
"epoch": 6.575311438278596,
"eval_loss": 0.939509928226471,
"eval_runtime": 219.1206,
"eval_samples_per_second": 7.16,
"eval_steps_per_second": 0.899,
"step": 2900
},
{
"epoch": 6.688561721404303,
"grad_norm": 0.7308298349380493,
"learning_rate": 9.028459273797842e-06,
"loss": 0.7827,
"step": 2950
},
{
"epoch": 6.801812004530011,
"grad_norm": 0.7362912893295288,
"learning_rate": 5.757278377494276e-06,
"loss": 0.7917,
"step": 3000
},
{
"epoch": 6.801812004530011,
"eval_loss": 0.9356247782707214,
"eval_runtime": 219.052,
"eval_samples_per_second": 7.163,
"eval_steps_per_second": 0.899,
"step": 3000
},
{
"epoch": 6.9150622876557195,
"grad_norm": 0.7543765902519226,
"learning_rate": 2.4860974811907098e-06,
"loss": 0.7738,
"step": 3050
},
{
"epoch": 7.029445073612684,
"grad_norm": 0.7134389877319336,
"learning_rate": 5.986301369863014e-05,
"loss": 0.7481,
"step": 3100
},
{
"epoch": 7.029445073612684,
"eval_loss": 0.9381225109100342,
"eval_runtime": 218.2269,
"eval_samples_per_second": 7.19,
"eval_steps_per_second": 0.903,
"step": 3100
},
{
"epoch": 7.1426953567383915,
"grad_norm": 0.8131405711174011,
"learning_rate": 5.757990867579909e-05,
"loss": 0.7725,
"step": 3150
},
{
"epoch": 7.2559456398641,
"grad_norm": 0.8759368062019348,
"learning_rate": 5.529680365296805e-05,
"loss": 0.7752,
"step": 3200
},
{
"epoch": 7.2559456398641,
"eval_loss": 0.9386877417564392,
"eval_runtime": 218.4203,
"eval_samples_per_second": 7.183,
"eval_steps_per_second": 0.902,
"step": 3200
},
{
"epoch": 7.369195922989808,
"grad_norm": 0.8374108076095581,
"learning_rate": 5.3013698630136986e-05,
"loss": 0.7765,
"step": 3250
},
{
"epoch": 7.482446206115515,
"grad_norm": 0.8505973815917969,
"learning_rate": 5.0730593607305946e-05,
"loss": 0.7791,
"step": 3300
},
{
"epoch": 7.482446206115515,
"eval_loss": 0.9266760349273682,
"eval_runtime": 218.4773,
"eval_samples_per_second": 7.182,
"eval_steps_per_second": 0.902,
"step": 3300
},
{
"epoch": 7.595696489241223,
"grad_norm": 0.8420349359512329,
"learning_rate": 4.8447488584474886e-05,
"loss": 0.7721,
"step": 3350
},
{
"epoch": 7.7089467723669305,
"grad_norm": 0.892084002494812,
"learning_rate": 4.616438356164384e-05,
"loss": 0.7626,
"step": 3400
},
{
"epoch": 7.7089467723669305,
"eval_loss": 0.9153051376342773,
"eval_runtime": 218.391,
"eval_samples_per_second": 7.184,
"eval_steps_per_second": 0.902,
"step": 3400
},
{
"epoch": 7.822197055492639,
"grad_norm": 1.0072320699691772,
"learning_rate": 4.3881278538812785e-05,
"loss": 0.7578,
"step": 3450
},
{
"epoch": 7.935447338618347,
"grad_norm": 0.841740608215332,
"learning_rate": 4.159817351598174e-05,
"loss": 0.755,
"step": 3500
},
{
"epoch": 7.935447338618347,
"eval_loss": 0.9030627012252808,
"eval_runtime": 218.5627,
"eval_samples_per_second": 7.179,
"eval_steps_per_second": 0.901,
"step": 3500
},
{
"epoch": 8.049830124575312,
"grad_norm": 0.9417058825492859,
"learning_rate": 3.9315068493150684e-05,
"loss": 0.7419,
"step": 3550
},
{
"epoch": 8.163080407701019,
"grad_norm": 0.8208181858062744,
"learning_rate": 3.703196347031964e-05,
"loss": 0.7079,
"step": 3600
},
{
"epoch": 8.163080407701019,
"eval_loss": 0.9004995226860046,
"eval_runtime": 218.599,
"eval_samples_per_second": 7.178,
"eval_steps_per_second": 0.901,
"step": 3600
},
{
"epoch": 8.276330690826727,
"grad_norm": 0.8969956040382385,
"learning_rate": 3.4748858447488584e-05,
"loss": 0.7184,
"step": 3650
},
{
"epoch": 8.389580973952436,
"grad_norm": 0.9903959631919861,
"learning_rate": 3.246575342465754e-05,
"loss": 0.6977,
"step": 3700
},
{
"epoch": 8.389580973952436,
"eval_loss": 0.895404577255249,
"eval_runtime": 218.551,
"eval_samples_per_second": 7.179,
"eval_steps_per_second": 0.901,
"step": 3700
},
{
"epoch": 8.502831257078142,
"grad_norm": 0.8987964391708374,
"learning_rate": 3.0182648401826487e-05,
"loss": 0.6981,
"step": 3750
},
{
"epoch": 8.61608154020385,
"grad_norm": 0.9351384043693542,
"learning_rate": 2.7899543378995436e-05,
"loss": 0.6985,
"step": 3800
},
{
"epoch": 8.61608154020385,
"eval_loss": 0.8867019414901733,
"eval_runtime": 218.5395,
"eval_samples_per_second": 7.179,
"eval_steps_per_second": 0.901,
"step": 3800
},
{
"epoch": 8.729331823329558,
"grad_norm": 0.9520925283432007,
"learning_rate": 2.5616438356164386e-05,
"loss": 0.7041,
"step": 3850
},
{
"epoch": 8.842582106455266,
"grad_norm": 0.9150193333625793,
"learning_rate": 2.3333333333333336e-05,
"loss": 0.6946,
"step": 3900
},
{
"epoch": 8.842582106455266,
"eval_loss": 0.8767301440238953,
"eval_runtime": 218.4183,
"eval_samples_per_second": 7.183,
"eval_steps_per_second": 0.902,
"step": 3900
},
{
"epoch": 8.955832389580975,
"grad_norm": 0.9718352556228638,
"learning_rate": 2.1050228310502286e-05,
"loss": 0.6837,
"step": 3950
},
{
"epoch": 9.070215175537939,
"grad_norm": 0.9025393724441528,
"learning_rate": 1.8767123287671235e-05,
"loss": 0.6821,
"step": 4000
},
{
"epoch": 9.070215175537939,
"eval_loss": 0.8735217452049255,
"eval_runtime": 218.5455,
"eval_samples_per_second": 7.179,
"eval_steps_per_second": 0.901,
"step": 4000
},
{
"epoch": 9.183465458663647,
"grad_norm": 0.9804911017417908,
"learning_rate": 1.6484018264840185e-05,
"loss": 0.6533,
"step": 4050
},
{
"epoch": 9.296715741789354,
"grad_norm": 0.8889093399047852,
"learning_rate": 1.4200913242009135e-05,
"loss": 0.6549,
"step": 4100
},
{
"epoch": 9.296715741789354,
"eval_loss": 0.8693042993545532,
"eval_runtime": 218.4928,
"eval_samples_per_second": 7.181,
"eval_steps_per_second": 0.902,
"step": 4100
},
{
"epoch": 9.409966024915063,
"grad_norm": 0.9306142926216125,
"learning_rate": 1.1917808219178083e-05,
"loss": 0.643,
"step": 4150
},
{
"epoch": 9.52321630804077,
"grad_norm": 1.0180792808532715,
"learning_rate": 9.634703196347032e-06,
"loss": 0.6498,
"step": 4200
},
{
"epoch": 9.52321630804077,
"eval_loss": 0.8649076223373413,
"eval_runtime": 218.6148,
"eval_samples_per_second": 7.177,
"eval_steps_per_second": 0.901,
"step": 4200
},
{
"epoch": 9.636466591166478,
"grad_norm": 1.038870930671692,
"learning_rate": 7.351598173515982e-06,
"loss": 0.6633,
"step": 4250
},
{
"epoch": 9.749716874292186,
"grad_norm": 0.9064520001411438,
"learning_rate": 5.068493150684932e-06,
"loss": 0.6503,
"step": 4300
},
{
"epoch": 9.749716874292186,
"eval_loss": 0.8624854683876038,
"eval_runtime": 218.607,
"eval_samples_per_second": 7.177,
"eval_steps_per_second": 0.901,
"step": 4300
},
{
"epoch": 9.862967157417893,
"grad_norm": 0.9673233032226562,
"learning_rate": 2.7853881278538815e-06,
"loss": 0.6439,
"step": 4350
},
{
"epoch": 9.976217440543602,
"grad_norm": 0.9512138366699219,
"learning_rate": 5.022831050228311e-07,
"loss": 0.6427,
"step": 4400
},
{
"epoch": 9.976217440543602,
"eval_loss": 0.8610817790031433,
"eval_runtime": 218.4692,
"eval_samples_per_second": 7.182,
"eval_steps_per_second": 0.902,
"step": 4400
},
{
"epoch": 10.090600226500566,
"grad_norm": 0.9980069994926453,
"learning_rate": 4.5028932140978435e-05,
"loss": 0.6435,
"step": 4450
},
{
"epoch": 10.203850509626275,
"grad_norm": 1.1042736768722534,
"learning_rate": 4.327546905137647e-05,
"loss": 0.6473,
"step": 4500
},
{
"epoch": 10.203850509626275,
"eval_loss": 0.8664056658744812,
"eval_runtime": 218.1832,
"eval_samples_per_second": 7.191,
"eval_steps_per_second": 0.903,
"step": 4500
},
{
"epoch": 10.317100792751981,
"grad_norm": 1.093027114868164,
"learning_rate": 4.1522005961774504e-05,
"loss": 0.6428,
"step": 4550
},
{
"epoch": 10.43035107587769,
"grad_norm": 1.1941519975662231,
"learning_rate": 3.9768542872172545e-05,
"loss": 0.6453,
"step": 4600
},
{
"epoch": 10.43035107587769,
"eval_loss": 0.8576545715332031,
"eval_runtime": 218.2133,
"eval_samples_per_second": 7.19,
"eval_steps_per_second": 0.903,
"step": 4600
},
{
"epoch": 10.543601359003397,
"grad_norm": 1.1875131130218506,
"learning_rate": 3.801507978257058e-05,
"loss": 0.6444,
"step": 4650
},
{
"epoch": 10.656851642129105,
"grad_norm": 1.057826042175293,
"learning_rate": 3.6261616692968614e-05,
"loss": 0.6374,
"step": 4700
},
{
"epoch": 10.656851642129105,
"eval_loss": 0.8471001982688904,
"eval_runtime": 218.248,
"eval_samples_per_second": 7.189,
"eval_steps_per_second": 0.903,
"step": 4700
},
{
"epoch": 10.770101925254814,
"grad_norm": 1.1481099128723145,
"learning_rate": 3.450815360336665e-05,
"loss": 0.6367,
"step": 4750
},
{
"epoch": 10.88335220838052,
"grad_norm": 1.043562412261963,
"learning_rate": 3.275469051376468e-05,
"loss": 0.6382,
"step": 4800
},
{
"epoch": 10.88335220838052,
"eval_loss": 0.8414534330368042,
"eval_runtime": 218.3266,
"eval_samples_per_second": 7.186,
"eval_steps_per_second": 0.902,
"step": 4800
},
{
"epoch": 10.996602491506229,
"grad_norm": 1.1026701927185059,
"learning_rate": 3.1001227424162724e-05,
"loss": 0.6363,
"step": 4850
},
{
"epoch": 11.110985277463193,
"grad_norm": 1.2548056840896606,
"learning_rate": 2.9247764334560758e-05,
"loss": 0.6197,
"step": 4900
},
{
"epoch": 11.110985277463193,
"eval_loss": 0.8344885110855103,
"eval_runtime": 218.346,
"eval_samples_per_second": 7.186,
"eval_steps_per_second": 0.902,
"step": 4900
},
{
"epoch": 11.224235560588902,
"grad_norm": 1.2327723503112793,
"learning_rate": 2.7494301244958792e-05,
"loss": 0.5955,
"step": 4950
},
{
"epoch": 11.337485843714608,
"grad_norm": 1.272136926651001,
"learning_rate": 2.5740838155356834e-05,
"loss": 0.5888,
"step": 5000
},
{
"epoch": 11.337485843714608,
"eval_loss": 0.8296782374382019,
"eval_runtime": 218.3829,
"eval_samples_per_second": 7.185,
"eval_steps_per_second": 0.902,
"step": 5000
},
{
"epoch": 11.450736126840317,
"grad_norm": 1.3154362440109253,
"learning_rate": 2.3987375065754868e-05,
"loss": 0.5821,
"step": 5050
},
{
"epoch": 11.563986409966025,
"grad_norm": 1.2641000747680664,
"learning_rate": 2.2233911976152902e-05,
"loss": 0.5786,
"step": 5100
},
{
"epoch": 11.563986409966025,
"eval_loss": 0.8227117657661438,
"eval_runtime": 218.3919,
"eval_samples_per_second": 7.184,
"eval_steps_per_second": 0.902,
"step": 5100
},
{
"epoch": 11.677236693091732,
"grad_norm": 1.308750033378601,
"learning_rate": 2.048044888655094e-05,
"loss": 0.5876,
"step": 5150
},
{
"epoch": 11.79048697621744,
"grad_norm": 1.2791666984558105,
"learning_rate": 1.8726985796948974e-05,
"loss": 0.5886,
"step": 5200
},
{
"epoch": 11.79048697621744,
"eval_loss": 0.8168981075286865,
"eval_runtime": 218.4082,
"eval_samples_per_second": 7.184,
"eval_steps_per_second": 0.902,
"step": 5200
},
{
"epoch": 11.90373725934315,
"grad_norm": 1.1309980154037476,
"learning_rate": 1.6973522707347012e-05,
"loss": 0.5816,
"step": 5250
},
{
"epoch": 12.018120045300114,
"grad_norm": 1.2232533693313599,
"learning_rate": 1.5220059617745046e-05,
"loss": 0.5993,
"step": 5300
},
{
"epoch": 12.018120045300114,
"eval_loss": 0.8122690916061401,
"eval_runtime": 218.3894,
"eval_samples_per_second": 7.184,
"eval_steps_per_second": 0.902,
"step": 5300
},
{
"epoch": 12.13137032842582,
"grad_norm": 1.1197330951690674,
"learning_rate": 1.3466596528143083e-05,
"loss": 0.552,
"step": 5350
},
{
"epoch": 12.244620611551529,
"grad_norm": 1.038383960723877,
"learning_rate": 1.171313343854112e-05,
"loss": 0.5461,
"step": 5400
},
{
"epoch": 12.244620611551529,
"eval_loss": 0.810990571975708,
"eval_runtime": 218.3359,
"eval_samples_per_second": 7.186,
"eval_steps_per_second": 0.902,
"step": 5400
},
{
"epoch": 12.357870894677237,
"grad_norm": 1.2155468463897705,
"learning_rate": 9.959670348939155e-06,
"loss": 0.5487,
"step": 5450
},
{
"epoch": 12.471121177802944,
"grad_norm": 1.0609550476074219,
"learning_rate": 8.20620725933719e-06,
"loss": 0.5524,
"step": 5500
},
{
"epoch": 12.471121177802944,
"eval_loss": 0.8065800070762634,
"eval_runtime": 218.3379,
"eval_samples_per_second": 7.186,
"eval_steps_per_second": 0.902,
"step": 5500
},
{
"epoch": 12.584371460928653,
"grad_norm": 1.1328603029251099,
"learning_rate": 6.452744169735227e-06,
"loss": 0.5437,
"step": 5550
},
{
"epoch": 12.69762174405436,
"grad_norm": 1.0544012784957886,
"learning_rate": 4.699281080133264e-06,
"loss": 0.5521,
"step": 5600
},
{
"epoch": 12.69762174405436,
"eval_loss": 0.8028028607368469,
"eval_runtime": 218.3014,
"eval_samples_per_second": 7.187,
"eval_steps_per_second": 0.902,
"step": 5600
},
{
"epoch": 12.810872027180068,
"grad_norm": 1.1334656476974487,
"learning_rate": 2.9458179905312994e-06,
"loss": 0.5537,
"step": 5650
},
{
"epoch": 12.924122310305776,
"grad_norm": 1.133388638496399,
"learning_rate": 1.1923549009293354e-06,
"loss": 0.5502,
"step": 5700
},
{
"epoch": 12.924122310305776,
"eval_loss": 0.801500141620636,
"eval_runtime": 218.1671,
"eval_samples_per_second": 7.192,
"eval_steps_per_second": 0.903,
"step": 5700
},
{
"epoch": 13.03850509626274,
"grad_norm": 1.093996524810791,
"learning_rate": 2.6302201974183753e-05,
"loss": 0.5348,
"step": 5750
},
{
"epoch": 13.15175537938845,
"grad_norm": 1.1750720739364624,
"learning_rate": 2.478359908883827e-05,
"loss": 0.5407,
"step": 5800
},
{
"epoch": 13.15175537938845,
"eval_loss": 0.8037387728691101,
"eval_runtime": 219.5954,
"eval_samples_per_second": 7.145,
"eval_steps_per_second": 0.897,
"step": 5800
},
{
"epoch": 13.265005662514156,
"grad_norm": 1.3370305299758911,
"learning_rate": 2.3264996203492785e-05,
"loss": 0.5427,
"step": 5850
},
{
"epoch": 13.378255945639864,
"grad_norm": 1.361132025718689,
"learning_rate": 2.1746393318147306e-05,
"loss": 0.5587,
"step": 5900
},
{
"epoch": 13.378255945639864,
"eval_loss": 0.7990919351577759,
"eval_runtime": 219.6972,
"eval_samples_per_second": 7.142,
"eval_steps_per_second": 0.897,
"step": 5900
},
{
"epoch": 13.491506228765571,
"grad_norm": 1.2097536325454712,
"learning_rate": 2.0227790432801824e-05,
"loss": 0.5378,
"step": 5950
},
{
"epoch": 13.60475651189128,
"grad_norm": 2.1065151691436768,
"learning_rate": 1.8709187547456342e-05,
"loss": 0.538,
"step": 6000
},
{
"epoch": 13.60475651189128,
"eval_loss": 0.7955650091171265,
"eval_runtime": 219.6995,
"eval_samples_per_second": 7.142,
"eval_steps_per_second": 0.897,
"step": 6000
},
{
"epoch": 13.718006795016988,
"grad_norm": 1.3054521083831787,
"learning_rate": 1.719058466211086e-05,
"loss": 0.5319,
"step": 6050
},
{
"epoch": 13.831257078142695,
"grad_norm": 1.2116392850875854,
"learning_rate": 1.5671981776765377e-05,
"loss": 0.5382,
"step": 6100
},
{
"epoch": 13.831257078142695,
"eval_loss": 0.7880010008811951,
"eval_runtime": 219.8172,
"eval_samples_per_second": 7.138,
"eval_steps_per_second": 0.896,
"step": 6100
},
{
"epoch": 13.944507361268403,
"grad_norm": 1.437024474143982,
"learning_rate": 1.4153378891419893e-05,
"loss": 0.5361,
"step": 6150
},
{
"epoch": 14.058890147225368,
"grad_norm": 1.1516680717468262,
"learning_rate": 1.2665148063781321e-05,
"loss": 0.5453,
"step": 6200
},
{
"epoch": 14.058890147225368,
"eval_loss": 0.783509373664856,
"eval_runtime": 219.7823,
"eval_samples_per_second": 7.139,
"eval_steps_per_second": 0.896,
"step": 6200
},
{
"epoch": 14.172140430351076,
"grad_norm": 1.182915210723877,
"learning_rate": 1.114654517843584e-05,
"loss": 0.5085,
"step": 6250
},
{
"epoch": 14.285390713476783,
"grad_norm": 1.225037693977356,
"learning_rate": 9.627942293090357e-06,
"loss": 0.5112,
"step": 6300
},
{
"epoch": 14.285390713476783,
"eval_loss": 0.7822558283805847,
"eval_runtime": 219.842,
"eval_samples_per_second": 7.137,
"eval_steps_per_second": 0.896,
"step": 6300
},
{
"epoch": 14.398640996602492,
"grad_norm": 1.1970784664154053,
"learning_rate": 8.109339407744875e-06,
"loss": 0.5079,
"step": 6350
},
{
"epoch": 14.5118912797282,
"grad_norm": 1.1259725093841553,
"learning_rate": 6.590736522399393e-06,
"loss": 0.5129,
"step": 6400
},
{
"epoch": 14.5118912797282,
"eval_loss": 0.7796412110328674,
"eval_runtime": 219.8188,
"eval_samples_per_second": 7.138,
"eval_steps_per_second": 0.896,
"step": 6400
},
{
"epoch": 14.625141562853907,
"grad_norm": 1.236473798751831,
"learning_rate": 5.072133637053911e-06,
"loss": 0.5055,
"step": 6450
},
{
"epoch": 14.738391845979615,
"grad_norm": 1.1682021617889404,
"learning_rate": 3.553530751708428e-06,
"loss": 0.5074,
"step": 6500
},
{
"epoch": 14.738391845979615,
"eval_loss": 0.7759413719177246,
"eval_runtime": 219.6677,
"eval_samples_per_second": 7.143,
"eval_steps_per_second": 0.897,
"step": 6500
},
{
"epoch": 14.851642129105322,
"grad_norm": 1.190508484840393,
"learning_rate": 2.0349278663629463e-06,
"loss": 0.5103,
"step": 6550
},
{
"epoch": 14.96489241223103,
"grad_norm": 1.18021559715271,
"learning_rate": 5.163249810174639e-07,
"loss": 0.5011,
"step": 6600
},
{
"epoch": 14.96489241223103,
"eval_loss": 0.7753218412399292,
"eval_runtime": 219.7467,
"eval_samples_per_second": 7.14,
"eval_steps_per_second": 0.896,
"step": 6600
},
{
"epoch": 15.079275198187995,
"grad_norm": 1.3528636693954468,
"learning_rate": 3.262518968133536e-05,
"loss": 0.5176,
"step": 6650
},
{
"epoch": 15.192525481313703,
"grad_norm": 1.3476513624191284,
"learning_rate": 3.1360647445624685e-05,
"loss": 0.5032,
"step": 6700
},
{
"epoch": 15.192525481313703,
"eval_loss": 0.7792025804519653,
"eval_runtime": 219.73,
"eval_samples_per_second": 7.141,
"eval_steps_per_second": 0.897,
"step": 6700
},
{
"epoch": 15.305775764439412,
"grad_norm": 1.3222737312316895,
"learning_rate": 3.009610520991401e-05,
"loss": 0.5141,
"step": 6750
},
{
"epoch": 15.419026047565119,
"grad_norm": 1.3413212299346924,
"learning_rate": 2.883156297420334e-05,
"loss": 0.5071,
"step": 6800
},
{
"epoch": 15.419026047565119,
"eval_loss": 0.7755314707756042,
"eval_runtime": 219.6961,
"eval_samples_per_second": 7.142,
"eval_steps_per_second": 0.897,
"step": 6800
},
{
"epoch": 15.532276330690827,
"grad_norm": 1.4169390201568604,
"learning_rate": 2.7567020738492665e-05,
"loss": 0.5066,
"step": 6850
},
{
"epoch": 15.645526613816534,
"grad_norm": 1.499665379524231,
"learning_rate": 2.6302478502781997e-05,
"loss": 0.5098,
"step": 6900
},
{
"epoch": 15.645526613816534,
"eval_loss": 0.7675374150276184,
"eval_runtime": 219.7136,
"eval_samples_per_second": 7.141,
"eval_steps_per_second": 0.897,
"step": 6900
},
{
"epoch": 15.758776896942242,
"grad_norm": 1.408177137374878,
"learning_rate": 2.5037936267071323e-05,
"loss": 0.5074,
"step": 6950
},
{
"epoch": 15.87202718006795,
"grad_norm": 1.5971038341522217,
"learning_rate": 2.3773394031360648e-05,
"loss": 0.4994,
"step": 7000
},
{
"epoch": 15.87202718006795,
"eval_loss": 0.7616310715675354,
"eval_runtime": 219.6635,
"eval_samples_per_second": 7.143,
"eval_steps_per_second": 0.897,
"step": 7000
},
{
"epoch": 15.985277463193658,
"grad_norm": 1.4312022924423218,
"learning_rate": 2.2508851795649977e-05,
"loss": 0.5081,
"step": 7050
},
{
"epoch": 16.099660249150624,
"grad_norm": 1.4189964532852173,
"learning_rate": 2.1244309559939302e-05,
"loss": 0.4831,
"step": 7100
},
{
"epoch": 16.099660249150624,
"eval_loss": 0.758693277835846,
"eval_runtime": 219.6153,
"eval_samples_per_second": 7.144,
"eval_steps_per_second": 0.897,
"step": 7100
},
{
"epoch": 16.212910532276332,
"grad_norm": 1.429587960243225,
"learning_rate": 1.9979767324228628e-05,
"loss": 0.4677,
"step": 7150
},
{
"epoch": 16.326160815402037,
"grad_norm": 1.5730829238891602,
"learning_rate": 1.8715225088517957e-05,
"loss": 0.4744,
"step": 7200
},
{
"epoch": 16.326160815402037,
"eval_loss": 0.7522332668304443,
"eval_runtime": 219.6797,
"eval_samples_per_second": 7.142,
"eval_steps_per_second": 0.897,
"step": 7200
},
{
"epoch": 16.439411098527746,
"grad_norm": 1.3818005323410034,
"learning_rate": 1.7450682852807286e-05,
"loss": 0.4821,
"step": 7250
},
{
"epoch": 16.552661381653454,
"grad_norm": 1.3803259134292603,
"learning_rate": 1.618614061709661e-05,
"loss": 0.4839,
"step": 7300
},
{
"epoch": 16.552661381653454,
"eval_loss": 0.7453923225402832,
"eval_runtime": 219.5972,
"eval_samples_per_second": 7.145,
"eval_steps_per_second": 0.897,
"step": 7300
},
{
"epoch": 16.665911664779163,
"grad_norm": 1.4981536865234375,
"learning_rate": 1.492159838138594e-05,
"loss": 0.468,
"step": 7350
},
{
"epoch": 16.77916194790487,
"grad_norm": 1.3549158573150635,
"learning_rate": 1.3657056145675265e-05,
"loss": 0.461,
"step": 7400
},
{
"epoch": 16.77916194790487,
"eval_loss": 0.7414634823799133,
"eval_runtime": 219.6011,
"eval_samples_per_second": 7.145,
"eval_steps_per_second": 0.897,
"step": 7400
},
{
"epoch": 16.892412231030576,
"grad_norm": 1.4302562475204468,
"learning_rate": 1.2392513909964594e-05,
"loss": 0.477,
"step": 7450
},
{
"epoch": 17.006795016987542,
"grad_norm": 1.2383838891983032,
"learning_rate": 1.112797167425392e-05,
"loss": 0.4667,
"step": 7500
},
{
"epoch": 17.006795016987542,
"eval_loss": 0.7361006140708923,
"eval_runtime": 219.5894,
"eval_samples_per_second": 7.145,
"eval_steps_per_second": 0.897,
"step": 7500
},
{
"epoch": 17.12004530011325,
"grad_norm": 1.2482600212097168,
"learning_rate": 9.863429438543249e-06,
"loss": 0.4463,
"step": 7550
},
{
"epoch": 17.23329558323896,
"grad_norm": 1.264907956123352,
"learning_rate": 8.598887202832576e-06,
"loss": 0.4479,
"step": 7600
},
{
"epoch": 17.23329558323896,
"eval_loss": 0.7333863973617554,
"eval_runtime": 219.7057,
"eval_samples_per_second": 7.141,
"eval_steps_per_second": 0.897,
"step": 7600
},
{
"epoch": 17.346545866364664,
"grad_norm": 1.2122907638549805,
"learning_rate": 7.334344967121902e-06,
"loss": 0.4535,
"step": 7650
},
{
"epoch": 17.459796149490373,
"grad_norm": 1.176712989807129,
"learning_rate": 6.06980273141123e-06,
"loss": 0.4404,
"step": 7700
},
{
"epoch": 17.459796149490373,
"eval_loss": 0.7308885455131531,
"eval_runtime": 219.6198,
"eval_samples_per_second": 7.144,
"eval_steps_per_second": 0.897,
"step": 7700
},
{
"epoch": 17.57304643261608,
"grad_norm": 1.264377474784851,
"learning_rate": 4.805260495700556e-06,
"loss": 0.4425,
"step": 7750
},
{
"epoch": 17.68629671574179,
"grad_norm": 1.3030773401260376,
"learning_rate": 3.5407182599898835e-06,
"loss": 0.4393,
"step": 7800
},
{
"epoch": 17.68629671574179,
"eval_loss": 0.7286545634269714,
"eval_runtime": 219.6474,
"eval_samples_per_second": 7.143,
"eval_steps_per_second": 0.897,
"step": 7800
},
{
"epoch": 17.7995469988675,
"grad_norm": 1.362890601158142,
"learning_rate": 2.276176024279211e-06,
"loss": 0.4425,
"step": 7850
},
{
"epoch": 17.912797281993203,
"grad_norm": 1.2483875751495361,
"learning_rate": 1.0116337885685382e-06,
"loss": 0.4434,
"step": 7900
},
{
"epoch": 17.912797281993203,
"eval_loss": 0.7274926900863647,
"eval_runtime": 219.7356,
"eval_samples_per_second": 7.14,
"eval_steps_per_second": 0.897,
"step": 7900
},
{
"epoch": 18.02718006795017,
"grad_norm": 1.2370803356170654,
"learning_rate": 1.9840728100113766e-05,
"loss": 0.4474,
"step": 7950
},
{
"epoch": 18.140430351075878,
"grad_norm": 1.454135537147522,
"learning_rate": 1.8703071672354948e-05,
"loss": 0.4294,
"step": 8000
},
{
"epoch": 18.140430351075878,
"eval_loss": 0.7317793965339661,
"eval_runtime": 216.9455,
"eval_samples_per_second": 7.232,
"eval_steps_per_second": 0.908,
"step": 8000
},
{
"epoch": 18.253680634201586,
"grad_norm": 1.4219353199005127,
"learning_rate": 1.7565415244596133e-05,
"loss": 0.4362,
"step": 8050
},
{
"epoch": 18.366930917327295,
"grad_norm": 1.4157588481903076,
"learning_rate": 1.6427758816837314e-05,
"loss": 0.4369,
"step": 8100
},
{
"epoch": 18.366930917327295,
"eval_loss": 0.7285795211791992,
"eval_runtime": 217.2426,
"eval_samples_per_second": 7.222,
"eval_steps_per_second": 0.907,
"step": 8100
},
{
"epoch": 18.480181200453,
"grad_norm": 1.5712941884994507,
"learning_rate": 1.52901023890785e-05,
"loss": 0.4393,
"step": 8150
},
{
"epoch": 18.59343148357871,
"grad_norm": 1.3149316310882568,
"learning_rate": 1.4152445961319682e-05,
"loss": 0.4377,
"step": 8200
},
{
"epoch": 18.59343148357871,
"eval_loss": 0.7221394181251526,
"eval_runtime": 217.271,
"eval_samples_per_second": 7.221,
"eval_steps_per_second": 0.907,
"step": 8200
},
{
"epoch": 18.706681766704417,
"grad_norm": 1.4053345918655396,
"learning_rate": 1.3014789533560864e-05,
"loss": 0.4395,
"step": 8250
},
{
"epoch": 18.819932049830125,
"grad_norm": 1.4755219221115112,
"learning_rate": 1.1877133105802047e-05,
"loss": 0.4464,
"step": 8300
},
{
"epoch": 18.819932049830125,
"eval_loss": 0.7166544795036316,
"eval_runtime": 217.3739,
"eval_samples_per_second": 7.218,
"eval_steps_per_second": 0.906,
"step": 8300
},
{
"epoch": 18.933182332955834,
"grad_norm": 1.3762329816818237,
"learning_rate": 1.073947667804323e-05,
"loss": 0.4482,
"step": 8350
},
{
"epoch": 19.047565118912797,
"grad_norm": 1.1810795068740845,
"learning_rate": 9.62457337883959e-06,
"loss": 0.4264,
"step": 8400
},
{
"epoch": 19.047565118912797,
"eval_loss": 0.7138365507125854,
"eval_runtime": 217.4069,
"eval_samples_per_second": 7.217,
"eval_steps_per_second": 0.906,
"step": 8400
},
{
"epoch": 19.160815402038505,
"grad_norm": 1.51250422000885,
"learning_rate": 8.486916951080774e-06,
"loss": 0.4197,
"step": 8450
},
{
"epoch": 19.274065685164214,
"grad_norm": 1.3608779907226562,
"learning_rate": 7.349260523321957e-06,
"loss": 0.4149,
"step": 8500
},
{
"epoch": 19.274065685164214,
"eval_loss": 0.7112516760826111,
"eval_runtime": 217.3073,
"eval_samples_per_second": 7.22,
"eval_steps_per_second": 0.907,
"step": 8500
},
{
"epoch": 19.387315968289922,
"grad_norm": 1.3459504842758179,
"learning_rate": 6.21160409556314e-06,
"loss": 0.4151,
"step": 8550
},
{
"epoch": 19.500566251415627,
"grad_norm": 1.270430326461792,
"learning_rate": 5.073947667804323e-06,
"loss": 0.4107,
"step": 8600
},
{
"epoch": 19.500566251415627,
"eval_loss": 0.7087224721908569,
"eval_runtime": 217.4347,
"eval_samples_per_second": 7.216,
"eval_steps_per_second": 0.906,
"step": 8600
},
{
"epoch": 19.613816534541336,
"grad_norm": 1.147330641746521,
"learning_rate": 3.936291240045506e-06,
"loss": 0.4204,
"step": 8650
},
{
"epoch": 19.727066817667044,
"grad_norm": 1.3679783344268799,
"learning_rate": 2.8213879408418657e-06,
"loss": 0.4241,
"step": 8700
},
{
"epoch": 19.727066817667044,
"eval_loss": 0.705489456653595,
"eval_runtime": 217.5656,
"eval_samples_per_second": 7.212,
"eval_steps_per_second": 0.905,
"step": 8700
},
{
"epoch": 19.840317100792753,
"grad_norm": 1.2595313787460327,
"learning_rate": 1.6837315130830492e-06,
"loss": 0.4157,
"step": 8750
},
{
"epoch": 19.95356738391846,
"grad_norm": 1.3279147148132324,
"learning_rate": 5.460750853242321e-07,
"loss": 0.4127,
"step": 8800
},
{
"epoch": 19.95356738391846,
"eval_loss": 0.7047748565673828,
"eval_runtime": 217.3024,
"eval_samples_per_second": 7.22,
"eval_steps_per_second": 0.907,
"step": 8800
},
{
"epoch": 20.067950169875424,
"grad_norm": 1.4412195682525635,
"learning_rate": 2.5650153268070802e-05,
"loss": 0.4066,
"step": 8850
},
{
"epoch": 20.181200453001132,
"grad_norm": 1.591495156288147,
"learning_rate": 2.466132700484525e-05,
"loss": 0.4107,
"step": 8900
},
{
"epoch": 20.181200453001132,
"eval_loss": 0.7136498093605042,
"eval_runtime": 217.2853,
"eval_samples_per_second": 7.221,
"eval_steps_per_second": 0.907,
"step": 8900
},
{
"epoch": 20.29445073612684,
"grad_norm": 1.5843544006347656,
"learning_rate": 2.3672500741619698e-05,
"loss": 0.4249,
"step": 8950
},
{
"epoch": 20.40770101925255,
"grad_norm": 1.7842884063720703,
"learning_rate": 2.268367447839415e-05,
"loss": 0.4292,
"step": 9000
},
{
"epoch": 20.40770101925255,
"eval_loss": 0.7056994438171387,
"eval_runtime": 217.247,
"eval_samples_per_second": 7.222,
"eval_steps_per_second": 0.907,
"step": 9000
},
{
"epoch": 20.520951302378258,
"grad_norm": 1.8154791593551636,
"learning_rate": 2.1694848215168594e-05,
"loss": 0.4171,
"step": 9050
},
{
"epoch": 20.634201585503963,
"grad_norm": 1.810947060585022,
"learning_rate": 2.0706021951943045e-05,
"loss": 0.4254,
"step": 9100
},
{
"epoch": 20.634201585503963,
"eval_loss": 0.7009139060974121,
"eval_runtime": 217.3713,
"eval_samples_per_second": 7.218,
"eval_steps_per_second": 0.906,
"step": 9100
},
{
"epoch": 20.74745186862967,
"grad_norm": 1.6273292303085327,
"learning_rate": 1.9717195688717493e-05,
"loss": 0.4182,
"step": 9150
},
{
"epoch": 20.86070215175538,
"grad_norm": 1.865356206893921,
"learning_rate": 1.8728369425491945e-05,
"loss": 0.4143,
"step": 9200
},
{
"epoch": 20.86070215175538,
"eval_loss": 0.6963376402854919,
"eval_runtime": 217.269,
"eval_samples_per_second": 7.221,
"eval_steps_per_second": 0.907,
"step": 9200
},
{
"epoch": 20.973952434881088,
"grad_norm": 2.093496799468994,
"learning_rate": 1.773954316226639e-05,
"loss": 0.4116,
"step": 9250
},
{
"epoch": 21.08833522083805,
"grad_norm": 1.6501940488815308,
"learning_rate": 1.6750716899040837e-05,
"loss": 0.4071,
"step": 9300
},
{
"epoch": 21.08833522083805,
"eval_loss": 0.6935945153236389,
"eval_runtime": 217.1749,
"eval_samples_per_second": 7.225,
"eval_steps_per_second": 0.907,
"step": 9300
},
{
"epoch": 21.20158550396376,
"grad_norm": 1.4257782697677612,
"learning_rate": 1.576189063581529e-05,
"loss": 0.3964,
"step": 9350
},
{
"epoch": 21.314835787089468,
"grad_norm": 1.7246989011764526,
"learning_rate": 1.4773064372589737e-05,
"loss": 0.3856,
"step": 9400
},
{
"epoch": 21.314835787089468,
"eval_loss": 0.6908048391342163,
"eval_runtime": 217.1554,
"eval_samples_per_second": 7.225,
"eval_steps_per_second": 0.907,
"step": 9400
},
{
"epoch": 21.428086070215176,
"grad_norm": 1.5051772594451904,
"learning_rate": 1.3784238109364186e-05,
"loss": 0.3903,
"step": 9450
},
{
"epoch": 21.541336353340885,
"grad_norm": 1.4209738969802856,
"learning_rate": 1.2795411846138633e-05,
"loss": 0.3992,
"step": 9500
},
{
"epoch": 21.541336353340885,
"eval_loss": 0.6845880746841431,
"eval_runtime": 217.0726,
"eval_samples_per_second": 7.228,
"eval_steps_per_second": 0.908,
"step": 9500
},
{
"epoch": 21.65458663646659,
"grad_norm": 1.4793322086334229,
"learning_rate": 1.1806585582913082e-05,
"loss": 0.392,
"step": 9550
},
{
"epoch": 21.7678369195923,
"grad_norm": 1.5042359828948975,
"learning_rate": 1.0817759319687532e-05,
"loss": 0.3833,
"step": 9600
},
{
"epoch": 21.7678369195923,
"eval_loss": 0.6798712611198425,
"eval_runtime": 217.2033,
"eval_samples_per_second": 7.224,
"eval_steps_per_second": 0.907,
"step": 9600
},
{
"epoch": 21.881087202718007,
"grad_norm": 1.4992612600326538,
"learning_rate": 9.82893305646198e-06,
"loss": 0.3912,
"step": 9650
},
{
"epoch": 21.994337485843715,
"grad_norm": 1.4592713117599487,
"learning_rate": 8.840106793236428e-06,
"loss": 0.3931,
"step": 9700
},
{
"epoch": 21.994337485843715,
"eval_loss": 0.6735964417457581,
"eval_runtime": 217.2254,
"eval_samples_per_second": 7.223,
"eval_steps_per_second": 0.907,
"step": 9700
},
{
"epoch": 22.108720271800678,
"grad_norm": 1.3605159521102905,
"learning_rate": 7.851280530010878e-06,
"loss": 0.378,
"step": 9750
},
{
"epoch": 22.221970554926386,
"grad_norm": 1.4335530996322632,
"learning_rate": 6.862454266785326e-06,
"loss": 0.379,
"step": 9800
},
{
"epoch": 22.221970554926386,
"eval_loss": 0.6728909015655518,
"eval_runtime": 217.1793,
"eval_samples_per_second": 7.224,
"eval_steps_per_second": 0.907,
"step": 9800
},
{
"epoch": 22.335220838052095,
"grad_norm": 1.2988905906677246,
"learning_rate": 5.873628003559775e-06,
"loss": 0.371,
"step": 9850
},
{
"epoch": 22.448471121177803,
"grad_norm": 1.407586693763733,
"learning_rate": 4.884801740334224e-06,
"loss": 0.3719,
"step": 9900
},
{
"epoch": 22.448471121177803,
"eval_loss": 0.670095682144165,
"eval_runtime": 217.2529,
"eval_samples_per_second": 7.222,
"eval_steps_per_second": 0.907,
"step": 9900
},
{
"epoch": 22.561721404303512,
"grad_norm": 1.5192447900772095,
"learning_rate": 1.9627103228740338e-05,
"loss": 0.3763,
"step": 9950
},
{
"epoch": 22.674971687429217,
"grad_norm": 1.6283540725708008,
"learning_rate": 1.8717598908594817e-05,
"loss": 0.3834,
"step": 10000
},
{
"epoch": 22.674971687429217,
"eval_loss": 0.6722336411476135,
"eval_runtime": 219.0452,
"eval_samples_per_second": 7.163,
"eval_steps_per_second": 0.899,
"step": 10000
},
{
"epoch": 22.788221970554925,
"grad_norm": 1.8841089010238647,
"learning_rate": 1.7808094588449296e-05,
"loss": 0.3766,
"step": 10050
},
{
"epoch": 22.901472253680634,
"grad_norm": 1.6647872924804688,
"learning_rate": 1.6898590268303775e-05,
"loss": 0.379,
"step": 10100
},
{
"epoch": 22.901472253680634,
"eval_loss": 0.6667923331260681,
"eval_runtime": 219.0896,
"eval_samples_per_second": 7.161,
"eval_steps_per_second": 0.899,
"step": 10100
},
{
"epoch": 23.0158550396376,
"grad_norm": 1.56221604347229,
"learning_rate": 1.5989085948158254e-05,
"loss": 0.3724,
"step": 10150
},
{
"epoch": 23.12910532276331,
"grad_norm": 1.741861343383789,
"learning_rate": 1.5079581628012735e-05,
"loss": 0.3648,
"step": 10200
},
{
"epoch": 23.12910532276331,
"eval_loss": 0.6666680574417114,
"eval_runtime": 219.1233,
"eval_samples_per_second": 7.16,
"eval_steps_per_second": 0.899,
"step": 10200
},
{
"epoch": 23.242355605889014,
"grad_norm": 1.4197698831558228,
"learning_rate": 1.4170077307867214e-05,
"loss": 0.3622,
"step": 10250
},
{
"epoch": 23.355605889014722,
"grad_norm": 1.5689094066619873,
"learning_rate": 1.3260572987721692e-05,
"loss": 0.3633,
"step": 10300
},
{
"epoch": 23.355605889014722,
"eval_loss": 0.6614246368408203,
"eval_runtime": 219.1161,
"eval_samples_per_second": 7.161,
"eval_steps_per_second": 0.899,
"step": 10300
},
{
"epoch": 23.46885617214043,
"grad_norm": 1.73819899559021,
"learning_rate": 1.2351068667576171e-05,
"loss": 0.3665,
"step": 10350
},
{
"epoch": 23.58210645526614,
"grad_norm": 1.470841884613037,
"learning_rate": 1.1441564347430652e-05,
"loss": 0.3594,
"step": 10400
},
{
"epoch": 23.58210645526614,
"eval_loss": 0.6564630270004272,
"eval_runtime": 219.1222,
"eval_samples_per_second": 7.16,
"eval_steps_per_second": 0.899,
"step": 10400
},
{
"epoch": 23.695356738391848,
"grad_norm": 1.4712560176849365,
"learning_rate": 1.0532060027285131e-05,
"loss": 0.3567,
"step": 10450
},
{
"epoch": 23.808607021517552,
"grad_norm": 1.3822436332702637,
"learning_rate": 9.622555707139608e-06,
"loss": 0.3655,
"step": 10500
},
{
"epoch": 23.808607021517552,
"eval_loss": 0.6519103050231934,
"eval_runtime": 219.1979,
"eval_samples_per_second": 7.158,
"eval_steps_per_second": 0.899,
"step": 10500
},
{
"epoch": 23.92185730464326,
"grad_norm": 1.49004065990448,
"learning_rate": 8.713051386994087e-06,
"loss": 0.356,
"step": 10550
},
{
"epoch": 24.036240090600227,
"grad_norm": 1.3333971500396729,
"learning_rate": 7.803547066848568e-06,
"loss": 0.3609,
"step": 10600
},
{
"epoch": 24.036240090600227,
"eval_loss": 0.6471272706985474,
"eval_runtime": 219.2457,
"eval_samples_per_second": 7.156,
"eval_steps_per_second": 0.899,
"step": 10600
},
{
"epoch": 24.149490373725936,
"grad_norm": 1.3648090362548828,
"learning_rate": 6.894042746703047e-06,
"loss": 0.3445,
"step": 10650
},
{
"epoch": 24.26274065685164,
"grad_norm": 1.2211579084396362,
"learning_rate": 5.984538426557527e-06,
"loss": 0.3438,
"step": 10700
},
{
"epoch": 24.26274065685164,
"eval_loss": 0.6461014151573181,
"eval_runtime": 219.1852,
"eval_samples_per_second": 7.158,
"eval_steps_per_second": 0.899,
"step": 10700
},
{
"epoch": 24.37599093997735,
"grad_norm": 1.332571029663086,
"learning_rate": 5.075034106412006e-06,
"loss": 0.3378,
"step": 10750
},
{
"epoch": 24.489241223103058,
"grad_norm": 1.263708233833313,
"learning_rate": 4.1655297862664855e-06,
"loss": 0.3457,
"step": 10800
},
{
"epoch": 24.489241223103058,
"eval_loss": 0.6429575681686401,
"eval_runtime": 219.1956,
"eval_samples_per_second": 7.158,
"eval_steps_per_second": 0.899,
"step": 10800
},
{
"epoch": 24.602491506228766,
"grad_norm": 1.2414239645004272,
"learning_rate": 3.256025466120964e-06,
"loss": 0.3478,
"step": 10850
},
{
"epoch": 24.715741789354475,
"grad_norm": 1.183813214302063,
"learning_rate": 2.3465211459754434e-06,
"loss": 0.3413,
"step": 10900
},
{
"epoch": 24.715741789354475,
"eval_loss": 0.6409078240394592,
"eval_runtime": 219.161,
"eval_samples_per_second": 7.159,
"eval_steps_per_second": 0.899,
"step": 10900
},
{
"epoch": 24.82899207248018,
"grad_norm": 1.3728307485580444,
"learning_rate": 1.4370168258299228e-06,
"loss": 0.3453,
"step": 10950
},
{
"epoch": 24.942242355605888,
"grad_norm": 1.182039499282837,
"learning_rate": 5.275125056844021e-07,
"loss": 0.3439,
"step": 11000
},
{
"epoch": 24.942242355605888,
"eval_loss": 0.6399772763252258,
"eval_runtime": 219.2001,
"eval_samples_per_second": 7.158,
"eval_steps_per_second": 0.899,
"step": 11000
},
{
"epoch": 25.056625141562854,
"grad_norm": 1.50559401512146,
"learning_rate": 1.4498610760293005e-05,
"loss": 0.3502,
"step": 11050
},
{
"epoch": 25.169875424688563,
"grad_norm": 1.501145839691162,
"learning_rate": 1.3656647301507115e-05,
"loss": 0.3373,
"step": 11100
},
{
"epoch": 25.169875424688563,
"eval_loss": 0.6438981294631958,
"eval_runtime": 219.2088,
"eval_samples_per_second": 7.158,
"eval_steps_per_second": 0.899,
"step": 11100
},
{
"epoch": 25.28312570781427,
"grad_norm": 2.4662117958068848,
"learning_rate": 1.2814683842721226e-05,
"loss": 0.3419,
"step": 11150
},
{
"epoch": 25.396375990939976,
"grad_norm": 1.5162239074707031,
"learning_rate": 1.1972720383935337e-05,
"loss": 0.3452,
"step": 11200
},
{
"epoch": 25.396375990939976,
"eval_loss": 0.6388878226280212,
"eval_runtime": 219.2066,
"eval_samples_per_second": 7.158,
"eval_steps_per_second": 0.899,
"step": 11200
},
{
"epoch": 25.509626274065685,
"grad_norm": 1.315088152885437,
"learning_rate": 1.113075692514945e-05,
"loss": 0.3487,
"step": 11250
},
{
"epoch": 25.622876557191393,
"grad_norm": 1.4352425336837769,
"learning_rate": 1.028879346636356e-05,
"loss": 0.3386,
"step": 11300
},
{
"epoch": 25.622876557191393,
"eval_loss": 0.6349427700042725,
"eval_runtime": 219.2727,
"eval_samples_per_second": 7.155,
"eval_steps_per_second": 0.898,
"step": 11300
},
{
"epoch": 25.7361268403171,
"grad_norm": 1.433242678642273,
"learning_rate": 9.446830007577671e-06,
"loss": 0.3365,
"step": 11350
},
{
"epoch": 25.84937712344281,
"grad_norm": 1.343719720840454,
"learning_rate": 8.604866548791782e-06,
"loss": 0.3409,
"step": 11400
},
{
"epoch": 25.84937712344281,
"eval_loss": 0.631538987159729,
"eval_runtime": 219.1534,
"eval_samples_per_second": 7.159,
"eval_steps_per_second": 0.899,
"step": 11400
},
{
"epoch": 25.962627406568515,
"grad_norm": 1.496169090270996,
"learning_rate": 7.762903090005893e-06,
"loss": 0.3316,
"step": 11450
},
{
"epoch": 26.07701019252548,
"grad_norm": 1.5395649671554565,
"learning_rate": 6.920939631220005e-06,
"loss": 0.3429,
"step": 11500
},
{
"epoch": 26.07701019252548,
"eval_loss": 0.6306207180023193,
"eval_runtime": 219.1057,
"eval_samples_per_second": 7.161,
"eval_steps_per_second": 0.899,
"step": 11500
},
{
"epoch": 26.19026047565119,
"grad_norm": 1.298531413078308,
"learning_rate": 6.078976172434116e-06,
"loss": 0.3274,
"step": 11550
},
{
"epoch": 26.3035107587769,
"grad_norm": 1.3206506967544556,
"learning_rate": 5.237012713648228e-06,
"loss": 0.3281,
"step": 11600
},
{
"epoch": 26.3035107587769,
"eval_loss": 0.6274815797805786,
"eval_runtime": 219.1686,
"eval_samples_per_second": 7.159,
"eval_steps_per_second": 0.899,
"step": 11600
},
{
"epoch": 26.416761041902603,
"grad_norm": 1.3031998872756958,
"learning_rate": 4.395049254862339e-06,
"loss": 0.3186,
"step": 11650
},
{
"epoch": 26.530011325028312,
"grad_norm": 1.232765555381775,
"learning_rate": 3.5530857960764503e-06,
"loss": 0.324,
"step": 11700
},
{
"epoch": 26.530011325028312,
"eval_loss": 0.6246664524078369,
"eval_runtime": 219.2008,
"eval_samples_per_second": 7.158,
"eval_steps_per_second": 0.899,
"step": 11700
},
{
"epoch": 26.64326160815402,
"grad_norm": 1.3108420372009277,
"learning_rate": 2.7111223372905617e-06,
"loss": 0.3265,
"step": 11750
},
{
"epoch": 26.75651189127973,
"grad_norm": 1.2938895225524902,
"learning_rate": 1.8691588785046728e-06,
"loss": 0.325,
"step": 11800
},
{
"epoch": 26.75651189127973,
"eval_loss": 0.6232734322547913,
"eval_runtime": 219.1916,
"eval_samples_per_second": 7.158,
"eval_steps_per_second": 0.899,
"step": 11800
},
{
"epoch": 26.869762174405437,
"grad_norm": 1.4028679132461548,
"learning_rate": 1.0271954197187842e-06,
"loss": 0.3177,
"step": 11850
},
{
"epoch": 26.983012457531142,
"grad_norm": 1.1903717517852783,
"learning_rate": 1.8523196093289553e-07,
"loss": 0.3282,
"step": 11900
},
{
"epoch": 26.983012457531142,
"eval_loss": 0.6224809885025024,
"eval_runtime": 219.1746,
"eval_samples_per_second": 7.159,
"eval_steps_per_second": 0.899,
"step": 11900
},
{
"epoch": 27.09739524348811,
"grad_norm": 1.5629881620407104,
"learning_rate": 1.9454545454545457e-05,
"loss": 0.322,
"step": 11950
},
{
"epoch": 27.210645526613817,
"grad_norm": 1.8830535411834717,
"learning_rate": 1.86969696969697e-05,
"loss": 0.3272,
"step": 12000
},
{
"epoch": 27.210645526613817,
"eval_loss": 0.6294634342193604,
"eval_runtime": 216.3539,
"eval_samples_per_second": 7.252,
"eval_steps_per_second": 0.911,
"step": 12000
},
{
"epoch": 27.323895809739525,
"grad_norm": 2.302112340927124,
"learning_rate": 1.793939393939394e-05,
"loss": 0.3343,
"step": 12050
},
{
"epoch": 27.43714609286523,
"grad_norm": 1.6443369388580322,
"learning_rate": 1.718181818181818e-05,
"loss": 0.3331,
"step": 12100
},
{
"epoch": 27.43714609286523,
"eval_loss": 0.6251102685928345,
"eval_runtime": 216.4341,
"eval_samples_per_second": 7.249,
"eval_steps_per_second": 0.91,
"step": 12100
},
{
"epoch": 27.55039637599094,
"grad_norm": 1.6903585195541382,
"learning_rate": 1.6424242424242424e-05,
"loss": 0.3338,
"step": 12150
},
{
"epoch": 27.663646659116647,
"grad_norm": 1.6333993673324585,
"learning_rate": 1.5666666666666667e-05,
"loss": 0.3293,
"step": 12200
},
{
"epoch": 27.663646659116647,
"eval_loss": 0.6229289174079895,
"eval_runtime": 216.5229,
"eval_samples_per_second": 7.246,
"eval_steps_per_second": 0.91,
"step": 12200
},
{
"epoch": 27.776896942242356,
"grad_norm": 1.7001616954803467,
"learning_rate": 1.4909090909090908e-05,
"loss": 0.3245,
"step": 12250
},
{
"epoch": 27.890147225368064,
"grad_norm": 1.919396162033081,
"learning_rate": 1.4151515151515152e-05,
"loss": 0.3284,
"step": 12300
},
{
"epoch": 27.890147225368064,
"eval_loss": 0.6198203563690186,
"eval_runtime": 216.5445,
"eval_samples_per_second": 7.246,
"eval_steps_per_second": 0.91,
"step": 12300
},
{
"epoch": 28.004530011325027,
"grad_norm": 2.137244462966919,
"learning_rate": 1.3393939393939395e-05,
"loss": 0.3363,
"step": 12350
},
{
"epoch": 28.117780294450736,
"grad_norm": 2.0852112770080566,
"learning_rate": 1.2636363636363638e-05,
"loss": 0.31,
"step": 12400
},
{
"epoch": 28.117780294450736,
"eval_loss": 0.6158381104469299,
"eval_runtime": 216.498,
"eval_samples_per_second": 7.247,
"eval_steps_per_second": 0.91,
"step": 12400
},
{
"epoch": 28.231030577576444,
"grad_norm": 1.7770031690597534,
"learning_rate": 1.187878787878788e-05,
"loss": 0.3094,
"step": 12450
},
{
"epoch": 28.344280860702153,
"grad_norm": 2.2683119773864746,
"learning_rate": 1.1136363636363637e-05,
"loss": 0.3106,
"step": 12500
},
{
"epoch": 28.344280860702153,
"eval_loss": 0.6127829551696777,
"eval_runtime": 216.6692,
"eval_samples_per_second": 7.241,
"eval_steps_per_second": 0.909,
"step": 12500
},
{
"epoch": 28.45753114382786,
"grad_norm": 2.2971391677856445,
"learning_rate": 1.037878787878788e-05,
"loss": 0.3123,
"step": 12550
},
{
"epoch": 28.570781426953566,
"grad_norm": 1.5072888135910034,
"learning_rate": 9.62121212121212e-06,
"loss": 0.3126,
"step": 12600
},
{
"epoch": 28.570781426953566,
"eval_loss": 0.6085474491119385,
"eval_runtime": 216.6624,
"eval_samples_per_second": 7.242,
"eval_steps_per_second": 0.909,
"step": 12600
},
{
"epoch": 28.684031710079275,
"grad_norm": 1.9965884685516357,
"learning_rate": 8.863636363636365e-06,
"loss": 0.3159,
"step": 12650
},
{
"epoch": 28.797281993204983,
"grad_norm": 1.9271585941314697,
"learning_rate": 8.106060606060606e-06,
"loss": 0.317,
"step": 12700
},
{
"epoch": 28.797281993204983,
"eval_loss": 0.6035783886909485,
"eval_runtime": 216.6906,
"eval_samples_per_second": 7.241,
"eval_steps_per_second": 0.909,
"step": 12700
},
{
"epoch": 28.91053227633069,
"grad_norm": 1.6005176305770874,
"learning_rate": 7.3484848484848486e-06,
"loss": 0.3063,
"step": 12750
},
{
"epoch": 29.024915062287654,
"grad_norm": 1.3837414979934692,
"learning_rate": 6.59090909090909e-06,
"loss": 0.3149,
"step": 12800
},
{
"epoch": 29.024915062287654,
"eval_loss": 0.6015561819076538,
"eval_runtime": 216.6327,
"eval_samples_per_second": 7.243,
"eval_steps_per_second": 0.909,
"step": 12800
},
{
"epoch": 29.138165345413363,
"grad_norm": 1.3116227388381958,
"learning_rate": 5.833333333333334e-06,
"loss": 0.2962,
"step": 12850
},
{
"epoch": 29.25141562853907,
"grad_norm": 1.3354703187942505,
"learning_rate": 5.075757575757576e-06,
"loss": 0.2966,
"step": 12900
},
{
"epoch": 29.25141562853907,
"eval_loss": 0.5984891653060913,
"eval_runtime": 216.6939,
"eval_samples_per_second": 7.241,
"eval_steps_per_second": 0.909,
"step": 12900
},
{
"epoch": 29.36466591166478,
"grad_norm": 1.1777273416519165,
"learning_rate": 4.3181818181818185e-06,
"loss": 0.2989,
"step": 12950
},
{
"epoch": 29.477916194790488,
"grad_norm": 1.9163764715194702,
"learning_rate": 3.5606060606060608e-06,
"loss": 0.3063,
"step": 13000
},
{
"epoch": 29.477916194790488,
"eval_loss": 0.5958673357963562,
"eval_runtime": 216.5489,
"eval_samples_per_second": 7.245,
"eval_steps_per_second": 0.91,
"step": 13000
},
{
"epoch": 29.591166477916193,
"grad_norm": 1.3537064790725708,
"learning_rate": 2.803030303030303e-06,
"loss": 0.2951,
"step": 13050
},
{
"epoch": 29.7044167610419,
"grad_norm": 1.3078798055648804,
"learning_rate": 2.0454545454545457e-06,
"loss": 0.2963,
"step": 13100
},
{
"epoch": 29.7044167610419,
"eval_loss": 0.5946142077445984,
"eval_runtime": 216.5944,
"eval_samples_per_second": 7.244,
"eval_steps_per_second": 0.91,
"step": 13100
},
{
"epoch": 29.81766704416761,
"grad_norm": 1.289014458656311,
"learning_rate": 1.287878787878788e-06,
"loss": 0.2959,
"step": 13150
},
{
"epoch": 29.93091732729332,
"grad_norm": 1.3634095191955566,
"learning_rate": 5.303030303030304e-07,
"loss": 0.2942,
"step": 13200
},
{
"epoch": 29.93091732729332,
"eval_loss": 0.5935017466545105,
"eval_runtime": 216.5971,
"eval_samples_per_second": 7.244,
"eval_steps_per_second": 0.91,
"step": 13200
},
{
"epoch": 30.045300113250285,
"grad_norm": 1.3688397407531738,
"learning_rate": 1.8012807271224955e-05,
"loss": 0.2984,
"step": 13250
},
{
"epoch": 30.15855039637599,
"grad_norm": 1.7011109590530396,
"learning_rate": 1.7324244302141433e-05,
"loss": 0.2943,
"step": 13300
},
{
"epoch": 30.15855039637599,
"eval_loss": 0.6008950471878052,
"eval_runtime": 216.6155,
"eval_samples_per_second": 7.243,
"eval_steps_per_second": 0.909,
"step": 13300
},
{
"epoch": 30.2718006795017,
"grad_norm": 1.7913622856140137,
"learning_rate": 1.663568133305791e-05,
"loss": 0.2931,
"step": 13350
},
{
"epoch": 30.385050962627407,
"grad_norm": 1.8850469589233398,
"learning_rate": 1.5947118363974385e-05,
"loss": 0.305,
"step": 13400
},
{
"epoch": 30.385050962627407,
"eval_loss": 0.5989060997962952,
"eval_runtime": 216.6435,
"eval_samples_per_second": 7.242,
"eval_steps_per_second": 0.909,
"step": 13400
},
{
"epoch": 30.498301245753115,
"grad_norm": 1.997729778289795,
"learning_rate": 1.5258555394890863e-05,
"loss": 0.3083,
"step": 13450
},
{
"epoch": 30.611551528878824,
"grad_norm": 1.8760637044906616,
"learning_rate": 1.4569992425807341e-05,
"loss": 0.3006,
"step": 13500
},
{
"epoch": 30.611551528878824,
"eval_loss": 0.596034824848175,
"eval_runtime": 216.6232,
"eval_samples_per_second": 7.243,
"eval_steps_per_second": 0.909,
"step": 13500
},
{
"epoch": 30.72480181200453,
"grad_norm": 1.8395705223083496,
"learning_rate": 1.388142945672382e-05,
"loss": 0.3057,
"step": 13550
},
{
"epoch": 30.838052095130237,
"grad_norm": 1.8442955017089844,
"learning_rate": 1.3192866487640296e-05,
"loss": 0.3038,
"step": 13600
},
{
"epoch": 30.838052095130237,
"eval_loss": 0.5910864472389221,
"eval_runtime": 216.6693,
"eval_samples_per_second": 7.241,
"eval_steps_per_second": 0.909,
"step": 13600
},
{
"epoch": 30.951302378255946,
"grad_norm": 1.9047316312789917,
"learning_rate": 1.2504303518556774e-05,
"loss": 0.2949,
"step": 13650
},
{
"epoch": 31.065685164212912,
"grad_norm": 1.8259665966033936,
"learning_rate": 1.181574054947325e-05,
"loss": 0.2984,
"step": 13700
},
{
"epoch": 31.065685164212912,
"eval_loss": 0.5886039733886719,
"eval_runtime": 216.6071,
"eval_samples_per_second": 7.244,
"eval_steps_per_second": 0.909,
"step": 13700
},
{
"epoch": 31.178935447338617,
"grad_norm": 1.9464973211288452,
"learning_rate": 1.1127177580389728e-05,
"loss": 0.2915,
"step": 13750
},
{
"epoch": 31.292185730464325,
"grad_norm": 1.4512701034545898,
"learning_rate": 1.0438614611306204e-05,
"loss": 0.2865,
"step": 13800
},
{
"epoch": 31.292185730464325,
"eval_loss": 0.5855095982551575,
"eval_runtime": 216.5764,
"eval_samples_per_second": 7.245,
"eval_steps_per_second": 0.91,
"step": 13800
},
{
"epoch": 31.405436013590034,
"grad_norm": 1.6476430892944336,
"learning_rate": 9.750051642222682e-06,
"loss": 0.2794,
"step": 13850
},
{
"epoch": 31.518686296715742,
"grad_norm": 2.3963589668273926,
"learning_rate": 9.06148867313916e-06,
"loss": 0.2958,
"step": 13900
},
{
"epoch": 31.518686296715742,
"eval_loss": 0.5817484259605408,
"eval_runtime": 216.6161,
"eval_samples_per_second": 7.243,
"eval_steps_per_second": 0.909,
"step": 13900
},
{
"epoch": 31.63193657984145,
"grad_norm": 1.6295278072357178,
"learning_rate": 8.372925704055636e-06,
"loss": 0.2842,
"step": 13950
},
{
"epoch": 31.745186862967156,
"grad_norm": 1.7011767625808716,
"learning_rate": 7.684362734972115e-06,
"loss": 0.2853,
"step": 14000
},
{
"epoch": 31.745186862967156,
"eval_loss": 0.5777027010917664,
"eval_runtime": 216.6548,
"eval_samples_per_second": 7.242,
"eval_steps_per_second": 0.909,
"step": 14000
},
{
"epoch": 31.858437146092864,
"grad_norm": 1.2951115369796753,
"learning_rate": 6.995799765888592e-06,
"loss": 0.2822,
"step": 14050
},
{
"epoch": 31.971687429218573,
"grad_norm": 1.6724634170532227,
"learning_rate": 6.307236796805067e-06,
"loss": 0.282,
"step": 14100
},
{
"epoch": 31.971687429218573,
"eval_loss": 0.5746533274650574,
"eval_runtime": 216.6678,
"eval_samples_per_second": 7.242,
"eval_steps_per_second": 0.909,
"step": 14100
},
{
"epoch": 32.08607021517554,
"grad_norm": 1.1698694229125977,
"learning_rate": 5.618673827721545e-06,
"loss": 0.2858,
"step": 14150
},
{
"epoch": 32.19932049830125,
"grad_norm": 1.4823814630508423,
"learning_rate": 4.930110858638023e-06,
"loss": 0.2724,
"step": 14200
},
{
"epoch": 32.19932049830125,
"eval_loss": 0.5733225345611572,
"eval_runtime": 216.6993,
"eval_samples_per_second": 7.24,
"eval_steps_per_second": 0.909,
"step": 14200
},
{
"epoch": 32.312570781426956,
"grad_norm": 1.2654746770858765,
"learning_rate": 4.2415478895545e-06,
"loss": 0.2668,
"step": 14250
},
{
"epoch": 32.425821064552665,
"grad_norm": 1.390316367149353,
"learning_rate": 3.5529849204709775e-06,
"loss": 0.2732,
"step": 14300
},
{
"epoch": 32.425821064552665,
"eval_loss": 0.5705001354217529,
"eval_runtime": 216.6456,
"eval_samples_per_second": 7.242,
"eval_steps_per_second": 0.909,
"step": 14300
},
{
"epoch": 32.539071347678366,
"grad_norm": 1.0841820240020752,
"learning_rate": 2.864421951387454e-06,
"loss": 0.2757,
"step": 14350
},
{
"epoch": 32.652321630804074,
"grad_norm": 1.1355277299880981,
"learning_rate": 2.175858982303932e-06,
"loss": 0.2726,
"step": 14400
},
{
"epoch": 32.652321630804074,
"eval_loss": 0.5688679814338684,
"eval_runtime": 216.6214,
"eval_samples_per_second": 7.243,
"eval_steps_per_second": 0.909,
"step": 14400
},
{
"epoch": 32.76557191392978,
"grad_norm": 1.3108878135681152,
"learning_rate": 1.4872960132204092e-06,
"loss": 0.273,
"step": 14450
},
{
"epoch": 32.87882219705549,
"grad_norm": 1.175482153892517,
"learning_rate": 7.987330441368863e-07,
"loss": 0.2695,
"step": 14500
},
{
"epoch": 32.87882219705549,
"eval_loss": 0.567724347114563,
"eval_runtime": 216.7209,
"eval_samples_per_second": 7.24,
"eval_steps_per_second": 0.909,
"step": 14500
},
{
"epoch": 32.9920724801812,
"grad_norm": 1.3629848957061768,
"learning_rate": 1.1017007505336364e-07,
"loss": 0.2808,
"step": 14550
},
{
"epoch": 33.10645526613816,
"grad_norm": 3.8983519077301025,
"learning_rate": 5.299015897047691e-05,
"loss": 0.2925,
"step": 14600
},
{
"epoch": 33.10645526613816,
"eval_loss": 0.6180706024169922,
"eval_runtime": 217.7372,
"eval_samples_per_second": 7.206,
"eval_steps_per_second": 0.905,
"step": 14600
},
{
"epoch": 33.21970554926387,
"grad_norm": 3.372912645339966,
"learning_rate": 5.249558415341913e-05,
"loss": 0.3275,
"step": 14650
},
{
"epoch": 33.33295583238958,
"grad_norm": 3.959416389465332,
"learning_rate": 5.199091597274792e-05,
"loss": 0.341,
"step": 14700
},
{
"epoch": 33.33295583238958,
"eval_loss": 0.6353843808174133,
"eval_runtime": 217.8515,
"eval_samples_per_second": 7.202,
"eval_steps_per_second": 0.904,
"step": 14700
},
{
"epoch": 33.44620611551529,
"grad_norm": 3.4942378997802734,
"learning_rate": 5.1486247792076715e-05,
"loss": 0.3485,
"step": 14750
},
{
"epoch": 33.559456398641,
"grad_norm": 3.3839058876037598,
"learning_rate": 5.098157961140551e-05,
"loss": 0.3442,
"step": 14800
},
{
"epoch": 33.559456398641,
"eval_loss": 0.6342476606369019,
"eval_runtime": 217.9772,
"eval_samples_per_second": 7.198,
"eval_steps_per_second": 0.904,
"step": 14800
},
{
"epoch": 33.672706681766705,
"grad_norm": 3.631831407546997,
"learning_rate": 5.04769114307343e-05,
"loss": 0.3505,
"step": 14850
},
{
"epoch": 33.785956964892414,
"grad_norm": 3.05415678024292,
"learning_rate": 4.9972243250063086e-05,
"loss": 0.342,
"step": 14900
},
{
"epoch": 33.785956964892414,
"eval_loss": 0.6282561421394348,
"eval_runtime": 218.0099,
"eval_samples_per_second": 7.197,
"eval_steps_per_second": 0.904,
"step": 14900
},
{
"epoch": 33.89920724801812,
"grad_norm": 3.213174343109131,
"learning_rate": 4.946757506939187e-05,
"loss": 0.3526,
"step": 14950
},
{
"epoch": 34.013590033975085,
"grad_norm": 2.7019898891448975,
"learning_rate": 4.8962906888720665e-05,
"loss": 0.3596,
"step": 15000
},
{
"epoch": 34.013590033975085,
"eval_loss": 0.6229637265205383,
"eval_runtime": 217.9233,
"eval_samples_per_second": 7.2,
"eval_steps_per_second": 0.904,
"step": 15000
},
{
"epoch": 34.12684031710079,
"grad_norm": 3.5813961029052734,
"learning_rate": 4.845823870804946e-05,
"loss": 0.3202,
"step": 15050
},
{
"epoch": 34.2400906002265,
"grad_norm": 2.996546983718872,
"learning_rate": 4.795357052737825e-05,
"loss": 0.3208,
"step": 15100
},
{
"epoch": 34.2400906002265,
"eval_loss": 0.6200416684150696,
"eval_runtime": 217.9712,
"eval_samples_per_second": 7.198,
"eval_steps_per_second": 0.904,
"step": 15100
},
{
"epoch": 34.35334088335221,
"grad_norm": 3.248931407928467,
"learning_rate": 4.744890234670704e-05,
"loss": 0.3191,
"step": 15150
},
{
"epoch": 34.46659116647792,
"grad_norm": 2.503894805908203,
"learning_rate": 4.6944234166035835e-05,
"loss": 0.3206,
"step": 15200
},
{
"epoch": 34.46659116647792,
"eval_loss": 0.6110924482345581,
"eval_runtime": 217.9474,
"eval_samples_per_second": 7.199,
"eval_steps_per_second": 0.904,
"step": 15200
},
{
"epoch": 34.57984144960363,
"grad_norm": 3.3107473850250244,
"learning_rate": 4.643956598536463e-05,
"loss": 0.3198,
"step": 15250
},
{
"epoch": 34.69309173272933,
"grad_norm": 2.6435258388519287,
"learning_rate": 4.5934897804693414e-05,
"loss": 0.3261,
"step": 15300
},
{
"epoch": 34.69309173272933,
"eval_loss": 0.603391706943512,
"eval_runtime": 217.9424,
"eval_samples_per_second": 7.199,
"eval_steps_per_second": 0.904,
"step": 15300
},
{
"epoch": 34.80634201585504,
"grad_norm": 3.1980810165405273,
"learning_rate": 4.5430229624022207e-05,
"loss": 0.3216,
"step": 15350
},
{
"epoch": 34.919592298980746,
"grad_norm": 2.4994754791259766,
"learning_rate": 4.4925561443351e-05,
"loss": 0.3168,
"step": 15400
},
{
"epoch": 34.919592298980746,
"eval_loss": 0.5940945148468018,
"eval_runtime": 218.056,
"eval_samples_per_second": 7.195,
"eval_steps_per_second": 0.903,
"step": 15400
},
{
"epoch": 35.033975084937715,
"grad_norm": 2.75138521194458,
"learning_rate": 4.442089326267979e-05,
"loss": 0.3191,
"step": 15450
},
{
"epoch": 35.14722536806342,
"grad_norm": 3.1039974689483643,
"learning_rate": 4.3916225082008585e-05,
"loss": 0.296,
"step": 15500
},
{
"epoch": 35.14722536806342,
"eval_loss": 0.5926975607872009,
"eval_runtime": 218.0089,
"eval_samples_per_second": 7.197,
"eval_steps_per_second": 0.904,
"step": 15500
},
{
"epoch": 35.260475651189125,
"grad_norm": 2.9686388969421387,
"learning_rate": 4.341155690133738e-05,
"loss": 0.2921,
"step": 15550
},
{
"epoch": 35.373725934314834,
"grad_norm": 2.5670547485351562,
"learning_rate": 4.290688872066616e-05,
"loss": 0.2909,
"step": 15600
},
{
"epoch": 35.373725934314834,
"eval_loss": 0.5892407894134521,
"eval_runtime": 217.8483,
"eval_samples_per_second": 7.202,
"eval_steps_per_second": 0.904,
"step": 15600
},
{
"epoch": 35.48697621744054,
"grad_norm": 2.28952956199646,
"learning_rate": 4.2402220539994956e-05,
"loss": 0.2947,
"step": 15650
},
{
"epoch": 35.60022650056625,
"grad_norm": 2.401625394821167,
"learning_rate": 4.189755235932374e-05,
"loss": 0.2915,
"step": 15700
},
{
"epoch": 35.60022650056625,
"eval_loss": 0.5815189480781555,
"eval_runtime": 217.8623,
"eval_samples_per_second": 7.202,
"eval_steps_per_second": 0.904,
"step": 15700
},
{
"epoch": 35.71347678369196,
"grad_norm": 2.7113890647888184,
"learning_rate": 4.1392884178652534e-05,
"loss": 0.2908,
"step": 15750
},
{
"epoch": 35.82672706681767,
"grad_norm": 2.949303388595581,
"learning_rate": 4.088821599798133e-05,
"loss": 0.2942,
"step": 15800
},
{
"epoch": 35.82672706681767,
"eval_loss": 0.5712306499481201,
"eval_runtime": 217.792,
"eval_samples_per_second": 7.204,
"eval_steps_per_second": 0.905,
"step": 15800
},
{
"epoch": 35.939977349943376,
"grad_norm": 2.3547251224517822,
"learning_rate": 4.038354781731012e-05,
"loss": 0.2854,
"step": 15850
},
{
"epoch": 36.05436013590034,
"grad_norm": 2.6130595207214355,
"learning_rate": 3.987887963663891e-05,
"loss": 0.2877,
"step": 15900
},
{
"epoch": 36.05436013590034,
"eval_loss": 0.5668493509292603,
"eval_runtime": 217.7584,
"eval_samples_per_second": 7.205,
"eval_steps_per_second": 0.905,
"step": 15900
},
{
"epoch": 36.16761041902605,
"grad_norm": 2.4720046520233154,
"learning_rate": 3.9374211455967705e-05,
"loss": 0.272,
"step": 15950
},
{
"epoch": 36.280860702151756,
"grad_norm": 3.291337490081787,
"learning_rate": 3.886954327529649e-05,
"loss": 0.2756,
"step": 16000
},
{
"epoch": 36.280860702151756,
"eval_loss": 0.5569508075714111,
"eval_runtime": 217.7243,
"eval_samples_per_second": 7.206,
"eval_steps_per_second": 0.905,
"step": 16000
},
{
"epoch": 36.394110985277464,
"grad_norm": 2.275122880935669,
"learning_rate": 3.8364875094625284e-05,
"loss": 0.2699,
"step": 16050
},
{
"epoch": 36.50736126840317,
"grad_norm": 2.351252317428589,
"learning_rate": 3.7860206913954076e-05,
"loss": 0.263,
"step": 16100
},
{
"epoch": 36.50736126840317,
"eval_loss": 0.552777886390686,
"eval_runtime": 217.6644,
"eval_samples_per_second": 7.208,
"eval_steps_per_second": 0.905,
"step": 16100
},
{
"epoch": 36.62061155152888,
"grad_norm": 2.0470945835113525,
"learning_rate": 3.735553873328287e-05,
"loss": 0.2605,
"step": 16150
},
{
"epoch": 36.73386183465459,
"grad_norm": 2.258258819580078,
"learning_rate": 3.685087055261166e-05,
"loss": 0.2621,
"step": 16200
},
{
"epoch": 36.73386183465459,
"eval_loss": 0.548316478729248,
"eval_runtime": 217.7617,
"eval_samples_per_second": 7.205,
"eval_steps_per_second": 0.905,
"step": 16200
},
{
"epoch": 36.84711211778029,
"grad_norm": 2.473788261413574,
"learning_rate": 3.6346202371940454e-05,
"loss": 0.2606,
"step": 16250
},
{
"epoch": 36.960362400906,
"grad_norm": 2.4730281829833984,
"learning_rate": 3.584153419126925e-05,
"loss": 0.2674,
"step": 16300
},
{
"epoch": 36.960362400906,
"eval_loss": 0.5399536490440369,
"eval_runtime": 217.745,
"eval_samples_per_second": 7.206,
"eval_steps_per_second": 0.905,
"step": 16300
},
{
"epoch": 37.07474518686297,
"grad_norm": 2.3119349479675293,
"learning_rate": 3.533686601059803e-05,
"loss": 0.258,
"step": 16350
},
{
"epoch": 37.18799546998868,
"grad_norm": 2.451964855194092,
"learning_rate": 3.4832197829926826e-05,
"loss": 0.2452,
"step": 16400
},
{
"epoch": 37.18799546998868,
"eval_loss": 0.5389652252197266,
"eval_runtime": 217.7139,
"eval_samples_per_second": 7.207,
"eval_steps_per_second": 0.905,
"step": 16400
},
{
"epoch": 37.30124575311438,
"grad_norm": 2.2861897945404053,
"learning_rate": 3.432752964925562e-05,
"loss": 0.2483,
"step": 16450
},
{
"epoch": 37.41449603624009,
"grad_norm": 1.7861238718032837,
"learning_rate": 3.3822861468584404e-05,
"loss": 0.2493,
"step": 16500
},
{
"epoch": 37.41449603624009,
"eval_loss": 0.5293774604797363,
"eval_runtime": 217.8898,
"eval_samples_per_second": 7.201,
"eval_steps_per_second": 0.904,
"step": 16500
},
{
"epoch": 37.5277463193658,
"grad_norm": 2.2910056114196777,
"learning_rate": 3.33181932879132e-05,
"loss": 0.2449,
"step": 16550
},
{
"epoch": 37.640996602491505,
"grad_norm": 2.102193832397461,
"learning_rate": 3.281352510724199e-05,
"loss": 0.2398,
"step": 16600
},
{
"epoch": 37.640996602491505,
"eval_loss": 0.5246281027793884,
"eval_runtime": 217.7811,
"eval_samples_per_second": 7.204,
"eval_steps_per_second": 0.905,
"step": 16600
},
{
"epoch": 37.75424688561721,
"grad_norm": 2.1423254013061523,
"learning_rate": 3.230885692657078e-05,
"loss": 0.2438,
"step": 16650
},
{
"epoch": 37.86749716874292,
"grad_norm": 2.031027317047119,
"learning_rate": 3.180418874589957e-05,
"loss": 0.2427,
"step": 16700
},
{
"epoch": 37.86749716874292,
"eval_loss": 0.5190041661262512,
"eval_runtime": 217.6886,
"eval_samples_per_second": 7.208,
"eval_steps_per_second": 0.905,
"step": 16700
},
{
"epoch": 37.98074745186863,
"grad_norm": 1.8530203104019165,
"learning_rate": 3.129952056522836e-05,
"loss": 0.2446,
"step": 16750
},
{
"epoch": 38.09513023782559,
"grad_norm": 1.9591715335845947,
"learning_rate": 3.0794852384557153e-05,
"loss": 0.2288,
"step": 16800
},
{
"epoch": 38.09513023782559,
"eval_loss": 0.5154264569282532,
"eval_runtime": 217.6837,
"eval_samples_per_second": 7.208,
"eval_steps_per_second": 0.905,
"step": 16800
},
{
"epoch": 38.2083805209513,
"grad_norm": 1.752700686454773,
"learning_rate": 3.0290184203885946e-05,
"loss": 0.2249,
"step": 16850
},
{
"epoch": 38.32163080407701,
"grad_norm": 1.7865016460418701,
"learning_rate": 2.978551602321474e-05,
"loss": 0.2254,
"step": 16900
},
{
"epoch": 38.32163080407701,
"eval_loss": 0.510138750076294,
"eval_runtime": 217.7044,
"eval_samples_per_second": 7.207,
"eval_steps_per_second": 0.905,
"step": 16900
},
{
"epoch": 38.43488108720272,
"grad_norm": 1.851835012435913,
"learning_rate": 2.9280847842543528e-05,
"loss": 0.2255,
"step": 16950
},
{
"epoch": 38.54813137032843,
"grad_norm": 1.7320882081985474,
"learning_rate": 2.877617966187232e-05,
"loss": 0.227,
"step": 17000
},
{
"epoch": 38.54813137032843,
"eval_loss": 0.5055522322654724,
"eval_runtime": 217.6805,
"eval_samples_per_second": 7.208,
"eval_steps_per_second": 0.905,
"step": 17000
},
{
"epoch": 38.661381653454136,
"grad_norm": 2.6240079402923584,
"learning_rate": 2.8271511481201113e-05,
"loss": 0.2227,
"step": 17050
},
{
"epoch": 38.774631936579844,
"grad_norm": 1.8069425821304321,
"learning_rate": 2.7766843300529906e-05,
"loss": 0.223,
"step": 17100
},
{
"epoch": 38.774631936579844,
"eval_loss": 0.49882474541664124,
"eval_runtime": 217.6912,
"eval_samples_per_second": 7.207,
"eval_steps_per_second": 0.905,
"step": 17100
},
{
"epoch": 38.88788221970555,
"grad_norm": 1.8260191679000854,
"learning_rate": 2.7262175119858695e-05,
"loss": 0.2239,
"step": 17150
},
{
"epoch": 39.002265005662515,
"grad_norm": 5.091439723968506,
"learning_rate": 2.6757506939187488e-05,
"loss": 0.2295,
"step": 17200
},
{
"epoch": 39.002265005662515,
"eval_loss": 0.49227145314216614,
"eval_runtime": 217.7384,
"eval_samples_per_second": 7.206,
"eval_steps_per_second": 0.905,
"step": 17200
},
{
"epoch": 39.115515288788224,
"grad_norm": 2.467454433441162,
"learning_rate": 2.625283875851628e-05,
"loss": 0.2057,
"step": 17250
},
{
"epoch": 39.22876557191393,
"grad_norm": 1.6467406749725342,
"learning_rate": 2.5748170577845067e-05,
"loss": 0.2022,
"step": 17300
},
{
"epoch": 39.22876557191393,
"eval_loss": 0.49300825595855713,
"eval_runtime": 217.7288,
"eval_samples_per_second": 7.206,
"eval_steps_per_second": 0.905,
"step": 17300
},
{
"epoch": 39.34201585503964,
"grad_norm": 1.446031093597412,
"learning_rate": 2.5243502397173856e-05,
"loss": 0.2055,
"step": 17350
},
{
"epoch": 39.45526613816534,
"grad_norm": 1.6686514616012573,
"learning_rate": 2.4738834216502652e-05,
"loss": 0.2147,
"step": 17400
},
{
"epoch": 39.45526613816534,
"eval_loss": 0.485858678817749,
"eval_runtime": 217.207,
"eval_samples_per_second": 7.224,
"eval_steps_per_second": 0.907,
"step": 17400
},
{
"epoch": 39.56851642129105,
"grad_norm": 1.513580322265625,
"learning_rate": 2.4234166035831445e-05,
"loss": 0.2046,
"step": 17450
},
{
"epoch": 39.68176670441676,
"grad_norm": 1.5527840852737427,
"learning_rate": 2.372949785516023e-05,
"loss": 0.2039,
"step": 17500
},
{
"epoch": 39.68176670441676,
"eval_loss": 0.48166778683662415,
"eval_runtime": 217.3046,
"eval_samples_per_second": 7.22,
"eval_steps_per_second": 0.907,
"step": 17500
},
{
"epoch": 39.79501698754247,
"grad_norm": 1.5010417699813843,
"learning_rate": 2.3224829674489023e-05,
"loss": 0.21,
"step": 17550
},
{
"epoch": 39.908267270668176,
"grad_norm": 2.1489455699920654,
"learning_rate": 2.2720161493817816e-05,
"loss": 0.2042,
"step": 17600
},
{
"epoch": 39.908267270668176,
"eval_loss": 0.4749002754688263,
"eval_runtime": 217.3286,
"eval_samples_per_second": 7.219,
"eval_steps_per_second": 0.906,
"step": 17600
},
{
"epoch": 40.02265005662514,
"grad_norm": 1.933009147644043,
"learning_rate": 2.221549331314661e-05,
"loss": 0.2157,
"step": 17650
},
{
"epoch": 40.13590033975085,
"grad_norm": 1.5398054122924805,
"learning_rate": 2.1710825132475398e-05,
"loss": 0.1894,
"step": 17700
},
{
"epoch": 40.13590033975085,
"eval_loss": 0.4718286097049713,
"eval_runtime": 217.1637,
"eval_samples_per_second": 7.225,
"eval_steps_per_second": 0.907,
"step": 17700
},
{
"epoch": 40.249150622876556,
"grad_norm": 1.7476941347122192,
"learning_rate": 2.120615695180419e-05,
"loss": 0.1924,
"step": 17750
},
{
"epoch": 40.362400906002264,
"grad_norm": 1.5386378765106201,
"learning_rate": 2.0701488771132983e-05,
"loss": 0.1918,
"step": 17800
},
{
"epoch": 40.362400906002264,
"eval_loss": 0.46827250719070435,
"eval_runtime": 217.1473,
"eval_samples_per_second": 7.226,
"eval_steps_per_second": 0.907,
"step": 17800
},
{
"epoch": 40.47565118912797,
"grad_norm": 1.6006604433059692,
"learning_rate": 2.0196820590461773e-05,
"loss": 0.188,
"step": 17850
},
{
"epoch": 40.58890147225368,
"grad_norm": 1.5906981229782104,
"learning_rate": 1.9692152409790562e-05,
"loss": 0.1922,
"step": 17900
},
{
"epoch": 40.58890147225368,
"eval_loss": 0.4618977904319763,
"eval_runtime": 217.1294,
"eval_samples_per_second": 7.226,
"eval_steps_per_second": 0.907,
"step": 17900
},
{
"epoch": 40.70215175537939,
"grad_norm": 1.451889991760254,
"learning_rate": 1.9187484229119355e-05,
"loss": 0.1961,
"step": 17950
},
{
"epoch": 40.8154020385051,
"grad_norm": 1.2037873268127441,
"learning_rate": 1.8682816048448147e-05,
"loss": 0.1951,
"step": 18000
},
{
"epoch": 40.8154020385051,
"eval_loss": 0.45853978395462036,
"eval_runtime": 217.1867,
"eval_samples_per_second": 7.224,
"eval_steps_per_second": 0.907,
"step": 18000
},
{
"epoch": 40.92865232163081,
"grad_norm": 1.124363899230957,
"learning_rate": 1.8178147867776936e-05,
"loss": 0.1907,
"step": 18050
},
{
"epoch": 41.04303510758777,
"grad_norm": 1.1726500988006592,
"learning_rate": 1.767347968710573e-05,
"loss": 0.1893,
"step": 18100
},
{
"epoch": 41.04303510758777,
"eval_loss": 0.4549981355667114,
"eval_runtime": 217.2331,
"eval_samples_per_second": 7.223,
"eval_steps_per_second": 0.907,
"step": 18100
},
{
"epoch": 41.15628539071348,
"grad_norm": 1.6041500568389893,
"learning_rate": 1.7168811506434522e-05,
"loss": 0.1769,
"step": 18150
},
{
"epoch": 41.26953567383919,
"grad_norm": 1.9704344272613525,
"learning_rate": 1.666414332576331e-05,
"loss": 0.1798,
"step": 18200
},
{
"epoch": 41.26953567383919,
"eval_loss": 0.45383498072624207,
"eval_runtime": 217.2468,
"eval_samples_per_second": 7.222,
"eval_steps_per_second": 0.907,
"step": 18200
},
{
"epoch": 41.382785956964895,
"grad_norm": 1.1522181034088135,
"learning_rate": 1.6159475145092104e-05,
"loss": 0.1858,
"step": 18250
},
{
"epoch": 41.4960362400906,
"grad_norm": 1.6338062286376953,
"learning_rate": 1.5654806964420893e-05,
"loss": 0.1776,
"step": 18300
},
{
"epoch": 41.4960362400906,
"eval_loss": 0.448618620634079,
"eval_runtime": 217.2186,
"eval_samples_per_second": 7.223,
"eval_steps_per_second": 0.907,
"step": 18300
},
{
"epoch": 41.609286523216305,
"grad_norm": 1.1537904739379883,
"learning_rate": 1.5150138783749684e-05,
"loss": 0.1759,
"step": 18350
},
{
"epoch": 41.72253680634201,
"grad_norm": 1.285271406173706,
"learning_rate": 1.4645470603078477e-05,
"loss": 0.1794,
"step": 18400
},
{
"epoch": 41.72253680634201,
"eval_loss": 0.4447907507419586,
"eval_runtime": 217.1396,
"eval_samples_per_second": 7.226,
"eval_steps_per_second": 0.907,
"step": 18400
},
{
"epoch": 41.83578708946772,
"grad_norm": 1.125063419342041,
"learning_rate": 1.4140802422407268e-05,
"loss": 0.1756,
"step": 18450
},
{
"epoch": 41.94903737259343,
"grad_norm": 1.1060149669647217,
"learning_rate": 1.3636134241736059e-05,
"loss": 0.1787,
"step": 18500
},
{
"epoch": 41.94903737259343,
"eval_loss": 0.4420225918292999,
"eval_runtime": 217.4988,
"eval_samples_per_second": 7.214,
"eval_steps_per_second": 0.906,
"step": 18500
},
{
"epoch": 42.06342015855039,
"grad_norm": 1.0146502256393433,
"learning_rate": 1.3131466061064851e-05,
"loss": 0.1791,
"step": 18550
},
{
"epoch": 42.1766704416761,
"grad_norm": 1.1884300708770752,
"learning_rate": 1.2626797880393642e-05,
"loss": 0.1658,
"step": 18600
},
{
"epoch": 42.1766704416761,
"eval_loss": 0.4396124482154846,
"eval_runtime": 217.7883,
"eval_samples_per_second": 7.204,
"eval_steps_per_second": 0.905,
"step": 18600
},
{
"epoch": 42.28992072480181,
"grad_norm": 1.1497679948806763,
"learning_rate": 1.2122129699722433e-05,
"loss": 0.1696,
"step": 18650
},
{
"epoch": 42.40317100792752,
"grad_norm": 1.32937490940094,
"learning_rate": 1.1617461519051224e-05,
"loss": 0.1643,
"step": 18700
},
{
"epoch": 42.40317100792752,
"eval_loss": 0.43940281867980957,
"eval_runtime": 218.6239,
"eval_samples_per_second": 7.177,
"eval_steps_per_second": 0.901,
"step": 18700
},
{
"epoch": 42.51642129105323,
"grad_norm": 1.5960180759429932,
"learning_rate": 1.1112793338380017e-05,
"loss": 0.1699,
"step": 18750
},
{
"epoch": 42.629671574178936,
"grad_norm": 1.0415377616882324,
"learning_rate": 1.0608125157708806e-05,
"loss": 0.1654,
"step": 18800
},
{
"epoch": 42.629671574178936,
"eval_loss": 0.43373051285743713,
"eval_runtime": 218.5653,
"eval_samples_per_second": 7.179,
"eval_steps_per_second": 0.901,
"step": 18800
},
{
"epoch": 42.742921857304644,
"grad_norm": 1.5094951391220093,
"learning_rate": 1.0103456977037597e-05,
"loss": 0.1669,
"step": 18850
},
{
"epoch": 42.85617214043035,
"grad_norm": 0.9974751472473145,
"learning_rate": 9.59878879636639e-06,
"loss": 0.1681,
"step": 18900
},
{
"epoch": 42.85617214043035,
"eval_loss": 0.4303882420063019,
"eval_runtime": 218.6322,
"eval_samples_per_second": 7.176,
"eval_steps_per_second": 0.901,
"step": 18900
},
{
"epoch": 42.96942242355606,
"grad_norm": 0.9117754697799683,
"learning_rate": 9.094120615695181e-06,
"loss": 0.1706,
"step": 18950
},
{
"epoch": 43.083805209513024,
"grad_norm": 1.0373188257217407,
"learning_rate": 8.589452435023972e-06,
"loss": 0.1643,
"step": 19000
},
{
"epoch": 43.083805209513024,
"eval_loss": 0.42856693267822266,
"eval_runtime": 218.5993,
"eval_samples_per_second": 7.178,
"eval_steps_per_second": 0.901,
"step": 19000
},
{
"epoch": 43.19705549263873,
"grad_norm": 0.9998382329940796,
"learning_rate": 8.084784254352763e-06,
"loss": 0.1617,
"step": 19050
},
{
"epoch": 43.31030577576444,
"grad_norm": 0.9849778413772583,
"learning_rate": 7.580116073681555e-06,
"loss": 0.1603,
"step": 19100
},
{
"epoch": 43.31030577576444,
"eval_loss": 0.4269334077835083,
"eval_runtime": 218.6737,
"eval_samples_per_second": 7.175,
"eval_steps_per_second": 0.901,
"step": 19100
},
{
"epoch": 43.42355605889015,
"grad_norm": 1.2009530067443848,
"learning_rate": 7.0754478930103465e-06,
"loss": 0.157,
"step": 19150
},
{
"epoch": 43.53680634201586,
"grad_norm": 0.8868136405944824,
"learning_rate": 6.570779712339137e-06,
"loss": 0.1582,
"step": 19200
},
{
"epoch": 43.53680634201586,
"eval_loss": 0.42409417033195496,
"eval_runtime": 218.6076,
"eval_samples_per_second": 7.177,
"eval_steps_per_second": 0.901,
"step": 19200
},
{
"epoch": 43.650056625141566,
"grad_norm": 0.8435959815979004,
"learning_rate": 6.0661115316679285e-06,
"loss": 0.158,
"step": 19250
},
{
"epoch": 43.76330690826727,
"grad_norm": 1.1476356983184814,
"learning_rate": 5.56144335099672e-06,
"loss": 0.1608,
"step": 19300
},
{
"epoch": 43.76330690826727,
"eval_loss": 0.422664076089859,
"eval_runtime": 218.5875,
"eval_samples_per_second": 7.178,
"eval_steps_per_second": 0.901,
"step": 19300
},
{
"epoch": 43.876557191392976,
"grad_norm": 0.765332043170929,
"learning_rate": 5.056775170325511e-06,
"loss": 0.1606,
"step": 19350
},
{
"epoch": 43.989807474518685,
"grad_norm": 0.9879748821258545,
"learning_rate": 4.552106989654302e-06,
"loss": 0.1573,
"step": 19400
},
{
"epoch": 43.989807474518685,
"eval_loss": 0.4201904535293579,
"eval_runtime": 218.5744,
"eval_samples_per_second": 7.178,
"eval_steps_per_second": 0.901,
"step": 19400
},
{
"epoch": 44.104190260475654,
"grad_norm": 0.6540424227714539,
"learning_rate": 4.047438808983093e-06,
"loss": 0.1572,
"step": 19450
},
{
"epoch": 44.217440543601356,
"grad_norm": 0.9124572277069092,
"learning_rate": 3.542770628311885e-06,
"loss": 0.1498,
"step": 19500
},
{
"epoch": 44.217440543601356,
"eval_loss": 0.4200960695743561,
"eval_runtime": 218.5932,
"eval_samples_per_second": 7.178,
"eval_steps_per_second": 0.901,
"step": 19500
},
{
"epoch": 44.330690826727064,
"grad_norm": 0.8609676957130432,
"learning_rate": 3.0381024476406765e-06,
"loss": 0.1509,
"step": 19550
},
{
"epoch": 44.44394110985277,
"grad_norm": 0.7417690753936768,
"learning_rate": 2.533434266969468e-06,
"loss": 0.1492,
"step": 19600
},
{
"epoch": 44.44394110985277,
"eval_loss": 0.41948238015174866,
"eval_runtime": 218.5587,
"eval_samples_per_second": 7.179,
"eval_steps_per_second": 0.901,
"step": 19600
},
{
"epoch": 44.55719139297848,
"grad_norm": 0.8361729979515076,
"learning_rate": 2.0287660862982593e-06,
"loss": 0.1541,
"step": 19650
},
{
"epoch": 44.67044167610419,
"grad_norm": 0.911729097366333,
"learning_rate": 1.5240979056270503e-06,
"loss": 0.1559,
"step": 19700
},
{
"epoch": 44.67044167610419,
"eval_loss": 0.41870439052581787,
"eval_runtime": 218.5239,
"eval_samples_per_second": 7.18,
"eval_steps_per_second": 0.902,
"step": 19700
},
{
"epoch": 44.7836919592299,
"grad_norm": 0.7706825733184814,
"learning_rate": 1.0194297249558415e-06,
"loss": 0.1554,
"step": 19750
},
{
"epoch": 44.89694224235561,
"grad_norm": 0.9403465986251831,
"learning_rate": 5.147615442846329e-07,
"loss": 0.1549,
"step": 19800
},
{
"epoch": 44.89694224235561,
"eval_loss": 0.4180174469947815,
"eval_runtime": 218.6086,
"eval_samples_per_second": 7.177,
"eval_steps_per_second": 0.901,
"step": 19800
},
{
"epoch": 45.01132502831257,
"grad_norm": 0.9403154253959656,
"learning_rate": 8.532637580325652e-06,
"loss": 0.1533,
"step": 19850
},
{
"epoch": 45.12457531143828,
"grad_norm": 0.8529797196388245,
"learning_rate": 8.049475769435185e-06,
"loss": 0.1507,
"step": 19900
},
{
"epoch": 45.12457531143828,
"eval_loss": 0.41985705494880676,
"eval_runtime": 218.5997,
"eval_samples_per_second": 7.178,
"eval_steps_per_second": 0.901,
"step": 19900
},
{
"epoch": 45.237825594563986,
"grad_norm": 0.877526581287384,
"learning_rate": 7.5663139585447175e-06,
"loss": 0.1498,
"step": 19950
},
{
"epoch": 45.351075877689695,
"grad_norm": 0.9668393731117249,
"learning_rate": 7.0831521476542495e-06,
"loss": 0.152,
"step": 20000
},
{
"epoch": 45.351075877689695,
"eval_loss": 0.41871750354766846,
"eval_runtime": 218.6395,
"eval_samples_per_second": 7.176,
"eval_steps_per_second": 0.901,
"step": 20000
},
{
"epoch": 45.4643261608154,
"grad_norm": 1.0251694917678833,
"learning_rate": 6.599990336763782e-06,
"loss": 0.1529,
"step": 20050
},
{
"epoch": 45.57757644394111,
"grad_norm": 1.4579505920410156,
"learning_rate": 6.116828525873315e-06,
"loss": 0.1571,
"step": 20100
},
{
"epoch": 45.57757644394111,
"eval_loss": 0.4161282181739807,
"eval_runtime": 218.584,
"eval_samples_per_second": 7.178,
"eval_steps_per_second": 0.901,
"step": 20100
},
{
"epoch": 45.69082672706682,
"grad_norm": 0.7462686896324158,
"learning_rate": 5.633666714982848e-06,
"loss": 0.1611,
"step": 20150
},
{
"epoch": 45.80407701019253,
"grad_norm": 0.9031079411506653,
"learning_rate": 5.150504904092381e-06,
"loss": 0.153,
"step": 20200
},
{
"epoch": 45.80407701019253,
"eval_loss": 0.41474393010139465,
"eval_runtime": 218.6388,
"eval_samples_per_second": 7.176,
"eval_steps_per_second": 0.901,
"step": 20200
},
{
"epoch": 45.91732729331823,
"grad_norm": 0.8560954332351685,
"learning_rate": 4.667343093201913e-06,
"loss": 0.1531,
"step": 20250
},
{
"epoch": 46.0317100792752,
"grad_norm": 1.1464442014694214,
"learning_rate": 4.184181282311446e-06,
"loss": 0.1535,
"step": 20300
},
{
"epoch": 46.0317100792752,
"eval_loss": 0.414587140083313,
"eval_runtime": 218.5994,
"eval_samples_per_second": 7.178,
"eval_steps_per_second": 0.901,
"step": 20300
},
{
"epoch": 46.14496036240091,
"grad_norm": 0.8384661674499512,
"learning_rate": 3.7010194714209794e-06,
"loss": 0.1488,
"step": 20350
},
{
"epoch": 46.25821064552662,
"grad_norm": 0.8300140500068665,
"learning_rate": 3.217857660530512e-06,
"loss": 0.1507,
"step": 20400
},
{
"epoch": 46.25821064552662,
"eval_loss": 0.413276344537735,
"eval_runtime": 218.6607,
"eval_samples_per_second": 7.176,
"eval_steps_per_second": 0.901,
"step": 20400
},
{
"epoch": 46.37146092865232,
"grad_norm": 0.7903048396110535,
"learning_rate": 2.7346958496400447e-06,
"loss": 0.148,
"step": 20450
},
{
"epoch": 46.48471121177803,
"grad_norm": 0.888008713722229,
"learning_rate": 2.2515340387495775e-06,
"loss": 0.1447,
"step": 20500
},
{
"epoch": 46.48471121177803,
"eval_loss": 0.4132575988769531,
"eval_runtime": 218.6308,
"eval_samples_per_second": 7.176,
"eval_steps_per_second": 0.901,
"step": 20500
},
{
"epoch": 46.597961494903736,
"grad_norm": 0.975723147392273,
"learning_rate": 1.7683722278591102e-06,
"loss": 0.1448,
"step": 20550
},
{
"epoch": 46.711211778029444,
"grad_norm": 0.7616918087005615,
"learning_rate": 1.2852104169686428e-06,
"loss": 0.1489,
"step": 20600
},
{
"epoch": 46.711211778029444,
"eval_loss": 0.4121854305267334,
"eval_runtime": 218.5727,
"eval_samples_per_second": 7.178,
"eval_steps_per_second": 0.901,
"step": 20600
},
{
"epoch": 46.82446206115515,
"grad_norm": 0.8662727475166321,
"learning_rate": 8.117118422959849e-07,
"loss": 0.1483,
"step": 20650
},
{
"epoch": 46.93771234428086,
"grad_norm": 0.7502096891403198,
"learning_rate": 3.2855003140551773e-07,
"loss": 0.1504,
"step": 20700
},
{
"epoch": 46.93771234428086,
"eval_loss": 0.41195544600486755,
"eval_runtime": 218.7035,
"eval_samples_per_second": 7.174,
"eval_steps_per_second": 0.901,
"step": 20700
},
{
"epoch": 47.052095130237824,
"grad_norm": 0.9510757923126221,
"learning_rate": 1.1871026339691191e-05,
"loss": 0.1467,
"step": 20750
},
{
"epoch": 47.16534541336353,
"grad_norm": 1.0743557214736938,
"learning_rate": 1.1416893732970029e-05,
"loss": 0.1497,
"step": 20800
},
{
"epoch": 47.16534541336353,
"eval_loss": 0.4156714379787445,
"eval_runtime": 217.4784,
"eval_samples_per_second": 7.215,
"eval_steps_per_second": 0.906,
"step": 20800
},
{
"epoch": 47.27859569648924,
"grad_norm": 1.567784070968628,
"learning_rate": 1.0962761126248864e-05,
"loss": 0.1513,
"step": 20850
},
{
"epoch": 47.39184597961495,
"grad_norm": 1.3992472887039185,
"learning_rate": 1.0508628519527702e-05,
"loss": 0.1533,
"step": 20900
},
{
"epoch": 47.39184597961495,
"eval_loss": 0.4152044653892517,
"eval_runtime": 217.5597,
"eval_samples_per_second": 7.212,
"eval_steps_per_second": 0.905,
"step": 20900
},
{
"epoch": 47.50509626274066,
"grad_norm": 1.5980275869369507,
"learning_rate": 1.005449591280654e-05,
"loss": 0.1523,
"step": 20950
},
{
"epoch": 47.618346545866366,
"grad_norm": 1.2810208797454834,
"learning_rate": 9.600363306085377e-06,
"loss": 0.1502,
"step": 21000
},
{
"epoch": 47.618346545866366,
"eval_loss": 0.4143332839012146,
"eval_runtime": 218.7933,
"eval_samples_per_second": 7.171,
"eval_steps_per_second": 0.9,
"step": 21000
},
{
"epoch": 47.731596828992075,
"grad_norm": 1.4590628147125244,
"learning_rate": 9.146230699364216e-06,
"loss": 0.1512,
"step": 21050
},
{
"epoch": 47.84484711211778,
"grad_norm": 1.3043591976165771,
"learning_rate": 8.692098092643053e-06,
"loss": 0.1561,
"step": 21100
},
{
"epoch": 47.84484711211778,
"eval_loss": 0.41214144229888916,
"eval_runtime": 218.7674,
"eval_samples_per_second": 7.172,
"eval_steps_per_second": 0.9,
"step": 21100
},
{
"epoch": 47.958097395243485,
"grad_norm": 0.8709500432014465,
"learning_rate": 8.247048138056313e-06,
"loss": 0.1478,
"step": 21150
},
{
"epoch": 48.072480181200454,
"grad_norm": 1.005632758140564,
"learning_rate": 7.79291553133515e-06,
"loss": 0.1534,
"step": 21200
},
{
"epoch": 48.072480181200454,
"eval_loss": 0.4120267927646637,
"eval_runtime": 218.9152,
"eval_samples_per_second": 7.167,
"eval_steps_per_second": 0.9,
"step": 21200
},
{
"epoch": 48.18573046432616,
"grad_norm": 1.2001721858978271,
"learning_rate": 7.347865576748411e-06,
"loss": 0.1431,
"step": 21250
},
{
"epoch": 48.29898074745187,
"grad_norm": 1.2004830837249756,
"learning_rate": 6.893732970027249e-06,
"loss": 0.1457,
"step": 21300
},
{
"epoch": 48.29898074745187,
"eval_loss": 0.4105300009250641,
"eval_runtime": 218.8468,
"eval_samples_per_second": 7.169,
"eval_steps_per_second": 0.9,
"step": 21300
},
{
"epoch": 48.41223103057758,
"grad_norm": 1.0889978408813477,
"learning_rate": 6.439600363306085e-06,
"loss": 0.1462,
"step": 21350
},
{
"epoch": 48.52548131370328,
"grad_norm": 0.9354040026664734,
"learning_rate": 5.985467756584924e-06,
"loss": 0.1464,
"step": 21400
},
{
"epoch": 48.52548131370328,
"eval_loss": 0.40966492891311646,
"eval_runtime": 218.8783,
"eval_samples_per_second": 7.168,
"eval_steps_per_second": 0.9,
"step": 21400
},
{
"epoch": 48.63873159682899,
"grad_norm": 0.8427848815917969,
"learning_rate": 5.53133514986376e-06,
"loss": 0.146,
"step": 21450
},
{
"epoch": 48.7519818799547,
"grad_norm": 0.9390880465507507,
"learning_rate": 5.077202543142598e-06,
"loss": 0.1462,
"step": 21500
},
{
"epoch": 48.7519818799547,
"eval_loss": 0.40723294019699097,
"eval_runtime": 218.8819,
"eval_samples_per_second": 7.168,
"eval_steps_per_second": 0.9,
"step": 21500
},
{
"epoch": 48.86523216308041,
"grad_norm": 1.0009453296661377,
"learning_rate": 4.623069936421435e-06,
"loss": 0.1442,
"step": 21550
},
{
"epoch": 48.978482446206115,
"grad_norm": 1.11566960811615,
"learning_rate": 4.168937329700273e-06,
"loss": 0.1469,
"step": 21600
},
{
"epoch": 48.978482446206115,
"eval_loss": 0.405407190322876,
"eval_runtime": 218.8588,
"eval_samples_per_second": 7.169,
"eval_steps_per_second": 0.9,
"step": 21600
},
{
"epoch": 49.09286523216308,
"grad_norm": 0.8854078054428101,
"learning_rate": 3.71480472297911e-06,
"loss": 0.1435,
"step": 21650
},
{
"epoch": 49.206115515288786,
"grad_norm": 0.8558112978935242,
"learning_rate": 3.260672116257948e-06,
"loss": 0.1378,
"step": 21700
},
{
"epoch": 49.206115515288786,
"eval_loss": 0.4061279296875,
"eval_runtime": 218.8569,
"eval_samples_per_second": 7.169,
"eval_steps_per_second": 0.9,
"step": 21700
},
{
"epoch": 49.319365798414495,
"grad_norm": 0.7999886870384216,
"learning_rate": 2.806539509536785e-06,
"loss": 0.1417,
"step": 21750
},
{
"epoch": 49.4326160815402,
"grad_norm": 0.948358952999115,
"learning_rate": 2.3524069028156224e-06,
"loss": 0.1415,
"step": 21800
},
{
"epoch": 49.4326160815402,
"eval_loss": 0.40446802973747253,
"eval_runtime": 218.7745,
"eval_samples_per_second": 7.172,
"eval_steps_per_second": 0.9,
"step": 21800
},
{
"epoch": 49.54586636466591,
"grad_norm": 0.7728579640388489,
"learning_rate": 1.8982742960944597e-06,
"loss": 0.1396,
"step": 21850
},
{
"epoch": 49.65911664779162,
"grad_norm": 0.7241719365119934,
"learning_rate": 1.4441416893732972e-06,
"loss": 0.1398,
"step": 21900
},
{
"epoch": 49.65911664779162,
"eval_loss": 0.4039037525653839,
"eval_runtime": 218.8617,
"eval_samples_per_second": 7.169,
"eval_steps_per_second": 0.9,
"step": 21900
},
{
"epoch": 49.77236693091733,
"grad_norm": 0.7789280414581299,
"learning_rate": 9.900090826521344e-07,
"loss": 0.1427,
"step": 21950
},
{
"epoch": 49.88561721404304,
"grad_norm": 0.8703135848045349,
"learning_rate": 5.358764759309719e-07,
"loss": 0.139,
"step": 22000
},
{
"epoch": 49.88561721404304,
"eval_loss": 0.40355798602104187,
"eval_runtime": 218.7677,
"eval_samples_per_second": 7.172,
"eval_steps_per_second": 0.9,
"step": 22000
},
{
"epoch": 49.998867497168746,
"grad_norm": 0.8729577660560608,
"learning_rate": 8.174386920980928e-08,
"loss": 0.1422,
"step": 22050
}
],
"logging_steps": 50,
"max_steps": 22050,
"num_input_tokens_seen": 0,
"num_train_epochs": 50,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.11661035307008e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}