{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.962000962000962, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.026559076829896673, "learning_rate": 1.9230769230769234e-06, "loss": 0.8553, "step": 1 }, { "epoch": 0.0, "grad_norm": 0.024109469955629563, "learning_rate": 9.615384615384616e-06, "loss": 0.8949, "step": 5 }, { "epoch": 0.01, "grad_norm": 0.035843143099576466, "learning_rate": 1.923076923076923e-05, "loss": 0.8487, "step": 10 }, { "epoch": 0.01, "grad_norm": 0.02927511973728809, "learning_rate": 2.8846153846153845e-05, "loss": 0.8298, "step": 15 }, { "epoch": 0.02, "grad_norm": 0.0482470347406101, "learning_rate": 3.846153846153846e-05, "loss": 0.8369, "step": 20 }, { "epoch": 0.02, "grad_norm": 0.04787111054382548, "learning_rate": 4.8076923076923084e-05, "loss": 0.8625, "step": 25 }, { "epoch": 0.03, "grad_norm": 0.09068040679248557, "learning_rate": 5.769230769230769e-05, "loss": 0.8133, "step": 30 }, { "epoch": 0.03, "grad_norm": 0.07412802798136442, "learning_rate": 6.730769230769232e-05, "loss": 0.8543, "step": 35 }, { "epoch": 0.04, "grad_norm": 0.0699997088327299, "learning_rate": 7.692307692307693e-05, "loss": 0.763, "step": 40 }, { "epoch": 0.04, "grad_norm": 0.05170267487139005, "learning_rate": 8.653846153846155e-05, "loss": 0.7827, "step": 45 }, { "epoch": 0.05, "grad_norm": 0.07357073672675946, "learning_rate": 9.615384615384617e-05, "loss": 0.8022, "step": 50 }, { "epoch": 0.05, "grad_norm": 0.05641137867224088, "learning_rate": 0.00010576923076923077, "loss": 0.7397, "step": 55 }, { "epoch": 0.06, "grad_norm": 0.057002882272420445, "learning_rate": 0.00011538461538461538, "loss": 0.777, "step": 60 }, { "epoch": 0.06, "grad_norm": 0.050844397685080686, "learning_rate": 0.000125, "loss": 0.7664, "step": 65 }, { "epoch": 0.07, "grad_norm": 0.05336016241159068, "learning_rate": 0.00013461538461538464, "loss": 0.7956, "step": 70 }, { "epoch": 0.07, "grad_norm": 0.05359842093910465, "learning_rate": 0.00014423076923076924, "loss": 0.751, "step": 75 }, { "epoch": 0.08, "grad_norm": 0.043971570409194735, "learning_rate": 0.00015384615384615385, "loss": 0.7823, "step": 80 }, { "epoch": 0.08, "grad_norm": 0.044563802592992065, "learning_rate": 0.00016346153846153846, "loss": 0.7908, "step": 85 }, { "epoch": 0.09, "grad_norm": 0.04801183812641932, "learning_rate": 0.0001730769230769231, "loss": 0.827, "step": 90 }, { "epoch": 0.09, "grad_norm": 0.0522380719803802, "learning_rate": 0.0001826923076923077, "loss": 0.8225, "step": 95 }, { "epoch": 0.1, "grad_norm": 0.048043288977650755, "learning_rate": 0.00019230769230769233, "loss": 0.7715, "step": 100 }, { "epoch": 0.1, "grad_norm": 0.04594631744893944, "learning_rate": 0.00019999943552317104, "loss": 0.7789, "step": 105 }, { "epoch": 0.11, "grad_norm": 0.050805778293244806, "learning_rate": 0.00019997967950328128, "loss": 0.8401, "step": 110 }, { "epoch": 0.11, "grad_norm": 0.04435482092479941, "learning_rate": 0.0001999317060143023, "loss": 0.7773, "step": 115 }, { "epoch": 0.12, "grad_norm": 0.05022795262704976, "learning_rate": 0.0001998555285958899, "loss": 0.7271, "step": 120 }, { "epoch": 0.12, "grad_norm": 0.05316130307831719, "learning_rate": 0.00019975116874775242, "loss": 0.8088, "step": 125 }, { "epoch": 0.13, "grad_norm": 0.04630310630357938, "learning_rate": 0.00019961865592358288, "loss": 0.7752, "step": 130 }, { "epoch": 0.13, "grad_norm": 0.05955477676937173, 
"learning_rate": 0.0001994580275227462, "loss": 0.7639, "step": 135 }, { "epoch": 0.13, "grad_norm": 0.04911523657827075, "learning_rate": 0.00019926932887972393, "loss": 0.7476, "step": 140 }, { "epoch": 0.14, "grad_norm": 0.04365854699386244, "learning_rate": 0.0001990526132513194, "loss": 0.7671, "step": 145 }, { "epoch": 0.14, "grad_norm": 0.0479590041051757, "learning_rate": 0.00019880794180162693, "loss": 0.8015, "step": 150 }, { "epoch": 0.15, "grad_norm": 0.053032592897760425, "learning_rate": 0.00019853538358476932, "loss": 0.8114, "step": 155 }, { "epoch": 0.15, "grad_norm": 0.04823282437342225, "learning_rate": 0.00019823501552540865, "loss": 0.7843, "step": 160 }, { "epoch": 0.16, "grad_norm": 0.13942036529350957, "learning_rate": 0.00019790692239703557, "loss": 0.7066, "step": 165 }, { "epoch": 0.16, "grad_norm": 0.05813669118246924, "learning_rate": 0.00019755119679804367, "loss": 0.7782, "step": 170 }, { "epoch": 0.17, "grad_norm": 0.045133845682998344, "learning_rate": 0.00019716793912559507, "loss": 0.8228, "step": 175 }, { "epoch": 0.17, "grad_norm": 0.04767176858103638, "learning_rate": 0.00019675725754728527, "loss": 0.8016, "step": 180 }, { "epoch": 0.18, "grad_norm": 0.05800072730359953, "learning_rate": 0.00019631926797061456, "loss": 0.7576, "step": 185 }, { "epoch": 0.18, "grad_norm": 0.044752361745923355, "learning_rate": 0.00019585409401027556, "loss": 0.7311, "step": 190 }, { "epoch": 0.19, "grad_norm": 0.0436163176892526, "learning_rate": 0.00019536186695326486, "loss": 0.7584, "step": 195 }, { "epoch": 0.19, "grad_norm": 0.07116796220713574, "learning_rate": 0.00019484272572182986, "loss": 0.7525, "step": 200 }, { "epoch": 0.2, "grad_norm": 0.051057018027582515, "learning_rate": 0.00019429681683426022, "loss": 0.798, "step": 205 }, { "epoch": 0.2, "grad_norm": 0.06621856781536847, "learning_rate": 0.00019372429436353606, "loss": 0.7242, "step": 210 }, { "epoch": 0.21, "grad_norm": 0.05280223873501623, "learning_rate": 0.0001931253198938432, "loss": 0.8013, "step": 215 }, { "epoch": 0.21, "grad_norm": 0.049913511617762696, "learning_rate": 0.00019250006247496928, "loss": 0.7282, "step": 220 }, { "epoch": 0.22, "grad_norm": 0.057224419119031235, "learning_rate": 0.00019184869857459232, "loss": 0.7986, "step": 225 }, { "epoch": 0.22, "grad_norm": 0.062107090513119266, "learning_rate": 0.00019117141202847586, "loss": 0.7305, "step": 230 }, { "epoch": 0.23, "grad_norm": 0.06236876490128247, "learning_rate": 0.00019046839398858474, "loss": 0.7961, "step": 235 }, { "epoch": 0.23, "grad_norm": 0.04936400763486868, "learning_rate": 0.00018973984286913584, "loss": 0.735, "step": 240 }, { "epoch": 0.24, "grad_norm": 0.053408247386716914, "learning_rate": 0.0001889859642905992, "loss": 0.7857, "step": 245 }, { "epoch": 0.24, "grad_norm": 0.05781811933339179, "learning_rate": 0.00018820697102166526, "loss": 0.7627, "step": 250 }, { "epoch": 0.25, "grad_norm": 0.05674627874967702, "learning_rate": 0.00018740308291919497, "loss": 0.7492, "step": 255 }, { "epoch": 0.25, "grad_norm": 0.05113375946042358, "learning_rate": 0.0001865745268661689, "loss": 0.8117, "step": 260 }, { "epoch": 0.25, "grad_norm": 0.06186000436202041, "learning_rate": 0.00018572153670765365, "loss": 0.801, "step": 265 }, { "epoch": 0.26, "grad_norm": 0.07562426320620609, "learning_rate": 0.00018484435318480332, "loss": 0.8071, "step": 270 }, { "epoch": 0.26, "grad_norm": 0.06311677584919179, "learning_rate": 0.0001839432238669147, "loss": 0.7843, "step": 275 }, { "epoch": 0.27, "grad_norm": 
0.06630091865014885, "learning_rate": 0.00018301840308155507, "loss": 0.7493, "step": 280 }, { "epoch": 0.27, "grad_norm": 0.06544583135680962, "learning_rate": 0.00018207015184278305, "loss": 0.782, "step": 285 }, { "epoch": 0.28, "grad_norm": 0.06541978841910906, "learning_rate": 0.000181098737777482, "loss": 0.766, "step": 290 }, { "epoch": 0.28, "grad_norm": 0.05791188755333043, "learning_rate": 0.00018010443504982694, "loss": 0.7499, "step": 295 }, { "epoch": 0.29, "grad_norm": 0.06550833011506903, "learning_rate": 0.000179087524283907, "loss": 0.8137, "step": 300 }, { "epoch": 0.29, "grad_norm": 0.058622811505104906, "learning_rate": 0.00017804829248452395, "loss": 0.7512, "step": 305 }, { "epoch": 0.3, "grad_norm": 0.0646444641030475, "learning_rate": 0.00017698703295619052, "loss": 0.7908, "step": 310 }, { "epoch": 0.3, "grad_norm": 0.06559061358782307, "learning_rate": 0.00017590404522035028, "loss": 0.7308, "step": 315 }, { "epoch": 0.31, "grad_norm": 0.06308845309249086, "learning_rate": 0.00017479963493084329, "loss": 0.7643, "step": 320 }, { "epoch": 0.31, "grad_norm": 0.06406066039467145, "learning_rate": 0.0001736741137876405, "loss": 0.7775, "step": 325 }, { "epoch": 0.32, "grad_norm": 0.06444860497739553, "learning_rate": 0.00017252779944887235, "loss": 0.7774, "step": 330 }, { "epoch": 0.32, "grad_norm": 0.06604642339408012, "learning_rate": 0.00017136101544117525, "loss": 0.7362, "step": 335 }, { "epoch": 0.33, "grad_norm": 0.06850287335544007, "learning_rate": 0.00017017409106838207, "loss": 0.7501, "step": 340 }, { "epoch": 0.33, "grad_norm": 0.06459535279086648, "learning_rate": 0.00016896736131858208, "loss": 0.7606, "step": 345 }, { "epoch": 0.34, "grad_norm": 0.06245922169687922, "learning_rate": 0.0001677411667695765, "loss": 0.7459, "step": 350 }, { "epoch": 0.34, "grad_norm": 0.0618217626323294, "learning_rate": 0.00016649585349275662, "loss": 0.7608, "step": 355 }, { "epoch": 0.35, "grad_norm": 0.06529458738837608, "learning_rate": 0.0001652317729554313, "loss": 0.7793, "step": 360 }, { "epoch": 0.35, "grad_norm": 0.07281345190056655, "learning_rate": 0.0001639492819216316, "loss": 0.7769, "step": 365 }, { "epoch": 0.36, "grad_norm": 0.07253822403948054, "learning_rate": 0.0001626487423514207, "loss": 0.7699, "step": 370 }, { "epoch": 0.36, "grad_norm": 0.059915171172645505, "learning_rate": 0.00016133052129873693, "loss": 0.7426, "step": 375 }, { "epoch": 0.37, "grad_norm": 0.06063815927279327, "learning_rate": 0.0001599949908077996, "loss": 0.7859, "step": 380 }, { "epoch": 0.37, "grad_norm": 0.07982151033712452, "learning_rate": 0.00015864252780810616, "loss": 0.7484, "step": 385 }, { "epoch": 0.38, "grad_norm": 0.07807371830206032, "learning_rate": 0.00015727351400805052, "loss": 0.7318, "step": 390 }, { "epoch": 0.38, "grad_norm": 0.06359423217230728, "learning_rate": 0.0001558883357871928, "loss": 0.7707, "step": 395 }, { "epoch": 0.38, "grad_norm": 0.09384048396706658, "learning_rate": 0.00015448738408721052, "loss": 0.7869, "step": 400 }, { "epoch": 0.39, "grad_norm": 0.07059181179022768, "learning_rate": 0.00015307105430156255, "loss": 0.7139, "step": 405 }, { "epoch": 0.39, "grad_norm": 0.07407320984033573, "learning_rate": 0.0001516397461638962, "loss": 0.7476, "step": 410 }, { "epoch": 0.4, "grad_norm": 0.07240911601504253, "learning_rate": 0.0001501938636352297, "loss": 0.7655, "step": 415 }, { "epoch": 0.4, "grad_norm": 0.07068099666527985, "learning_rate": 0.00014873381478994134, "loss": 0.7893, "step": 420 }, { "epoch": 0.41, "grad_norm": 
0.08305359605908401, "learning_rate": 0.00014726001170059792, "loss": 0.7111, "step": 425 }, { "epoch": 0.41, "grad_norm": 0.0670085750282625, "learning_rate": 0.00014577287032165468, "loss": 0.7527, "step": 430 }, { "epoch": 0.42, "grad_norm": 0.06789518226534799, "learning_rate": 0.00014427281037205945, "loss": 0.7751, "step": 435 }, { "epoch": 0.42, "grad_norm": 0.09136108168814464, "learning_rate": 0.00014276025521679471, "loss": 0.726, "step": 440 }, { "epoch": 0.43, "grad_norm": 0.06956344474331025, "learning_rate": 0.00014123563174739037, "loss": 0.8187, "step": 445 }, { "epoch": 0.43, "grad_norm": 0.07271098739476833, "learning_rate": 0.00013969937026144118, "loss": 0.7787, "step": 450 }, { "epoch": 0.44, "grad_norm": 0.06984149301611005, "learning_rate": 0.00013815190434116317, "loss": 0.7873, "step": 455 }, { "epoch": 0.44, "grad_norm": 0.07335035453389814, "learning_rate": 0.00013659367073102268, "loss": 0.7609, "step": 460 }, { "epoch": 0.45, "grad_norm": 0.0744105188121296, "learning_rate": 0.00013502510921447323, "loss": 0.7169, "step": 465 }, { "epoch": 0.45, "grad_norm": 0.06947572989302908, "learning_rate": 0.00013344666248983432, "loss": 0.7837, "step": 470 }, { "epoch": 0.46, "grad_norm": 0.07999096873197906, "learning_rate": 0.000131858776045348, "loss": 0.7727, "step": 475 }, { "epoch": 0.46, "grad_norm": 0.07774452098436961, "learning_rate": 0.00013026189803344774, "loss": 0.8242, "step": 480 }, { "epoch": 0.47, "grad_norm": 0.07801636818427328, "learning_rate": 0.00012865647914427544, "loss": 0.7269, "step": 485 }, { "epoch": 0.47, "grad_norm": 0.11086472292549841, "learning_rate": 0.00012704297247848216, "loss": 0.7503, "step": 490 }, { "epoch": 0.48, "grad_norm": 0.0724703886664607, "learning_rate": 0.00012542183341934872, "loss": 0.81, "step": 495 }, { "epoch": 0.48, "grad_norm": 0.08424198871373768, "learning_rate": 0.00012379351950426187, "loss": 0.7102, "step": 500 }, { "epoch": 0.49, "grad_norm": 0.07399506224237913, "learning_rate": 0.0001221584902955827, "loss": 0.811, "step": 505 }, { "epoch": 0.49, "grad_norm": 0.07513530266538687, "learning_rate": 0.00012051720725094324, "loss": 0.7328, "step": 510 }, { "epoch": 0.5, "grad_norm": 0.08300665590930759, "learning_rate": 0.00011887013359300837, "loss": 0.7728, "step": 515 }, { "epoch": 0.5, "grad_norm": 0.08603160091015216, "learning_rate": 0.00011721773417873965, "loss": 0.8092, "step": 520 }, { "epoch": 0.51, "grad_norm": 0.06828684892985817, "learning_rate": 0.00011556047536819777, "loss": 0.7905, "step": 525 }, { "epoch": 0.51, "grad_norm": 0.0888375409363043, "learning_rate": 0.00011389882489292061, "loss": 0.7616, "step": 530 }, { "epoch": 0.51, "grad_norm": 0.07153992771335722, "learning_rate": 0.0001122332517239147, "loss": 0.7231, "step": 535 }, { "epoch": 0.52, "grad_norm": 0.07078385208785225, "learning_rate": 0.00011056422593929635, "loss": 0.7744, "step": 540 }, { "epoch": 0.52, "grad_norm": 0.06724179763601197, "learning_rate": 0.00010889221859162062, "loss": 0.7385, "step": 545 }, { "epoch": 0.53, "grad_norm": 0.08127731002438206, "learning_rate": 0.00010721770157493527, "loss": 0.737, "step": 550 }, { "epoch": 0.53, "grad_norm": 0.07971841929977522, "learning_rate": 0.000105541147491597, "loss": 0.7129, "step": 555 }, { "epoch": 0.54, "grad_norm": 0.09905140750790473, "learning_rate": 0.00010386302951888804, "loss": 0.7682, "step": 560 }, { "epoch": 0.54, "grad_norm": 0.07787324370915785, "learning_rate": 0.00010218382127547022, "loss": 0.7988, "step": 565 }, { "epoch": 0.55, "grad_norm": 
0.06979898484314451, "learning_rate": 0.00010050399668771479, "loss": 0.7505, "step": 570 }, { "epoch": 0.55, "grad_norm": 0.08294483662862129, "learning_rate": 9.882402985594515e-05, "loss": 0.7254, "step": 575 }, { "epoch": 0.56, "grad_norm": 0.09404505448514887, "learning_rate": 9.71443949206304e-05, "loss": 0.7744, "step": 580 }, { "epoch": 0.56, "grad_norm": 0.08296480696219607, "learning_rate": 9.546556592856789e-05, "loss": 0.7255, "step": 585 }, { "epoch": 0.57, "grad_norm": 0.08380203772972816, "learning_rate": 9.378801669909197e-05, "loss": 0.6704, "step": 590 }, { "epoch": 0.57, "grad_norm": 0.09814150545542286, "learning_rate": 9.211222069034695e-05, "loss": 0.7107, "step": 595 }, { "epoch": 0.58, "grad_norm": 0.0951778654867295, "learning_rate": 9.043865086566214e-05, "loss": 0.7158, "step": 600 }, { "epoch": 0.58, "grad_norm": 0.08511949253631121, "learning_rate": 8.87677795600663e-05, "loss": 0.7572, "step": 605 }, { "epoch": 0.59, "grad_norm": 0.1143213911299264, "learning_rate": 8.710007834697969e-05, "loss": 0.7785, "step": 610 }, { "epoch": 0.59, "grad_norm": 0.08008142575287244, "learning_rate": 8.543601790512083e-05, "loss": 0.7327, "step": 615 }, { "epoch": 0.6, "grad_norm": 0.09526058947958355, "learning_rate": 8.377606788566597e-05, "loss": 0.703, "step": 620 }, { "epoch": 0.6, "grad_norm": 0.08273005979279956, "learning_rate": 8.212069677969851e-05, "loss": 0.7497, "step": 625 }, { "epoch": 0.61, "grad_norm": 0.08864939979402765, "learning_rate": 8.047037178598567e-05, "loss": 0.7573, "step": 630 }, { "epoch": 0.61, "grad_norm": 0.08394557070047488, "learning_rate": 7.882555867912017e-05, "loss": 0.7827, "step": 635 }, { "epoch": 0.62, "grad_norm": 0.09978852942456092, "learning_rate": 7.718672167806354e-05, "loss": 0.7201, "step": 640 }, { "epoch": 0.62, "grad_norm": 0.07987549874350373, "learning_rate": 7.55543233151289e-05, "loss": 0.7129, "step": 645 }, { "epoch": 0.63, "grad_norm": 0.09662339979538469, "learning_rate": 7.392882430543928e-05, "loss": 0.7593, "step": 650 }, { "epoch": 0.63, "grad_norm": 0.0966150953702372, "learning_rate": 7.231068341689923e-05, "loss": 0.6704, "step": 655 }, { "epoch": 0.63, "grad_norm": 0.08961534426031717, "learning_rate": 7.070035734071574e-05, "loss": 0.781, "step": 660 }, { "epoch": 0.64, "grad_norm": 0.09684345543910731, "learning_rate": 6.909830056250527e-05, "loss": 0.7787, "step": 665 }, { "epoch": 0.64, "grad_norm": 0.10518057650982461, "learning_rate": 6.750496523402352e-05, "loss": 0.7658, "step": 670 }, { "epoch": 0.65, "grad_norm": 0.08726496496105966, "learning_rate": 6.592080104555357e-05, "loss": 0.7515, "step": 675 }, { "epoch": 0.65, "grad_norm": 0.0933748204446253, "learning_rate": 6.434625509898897e-05, "loss": 0.7474, "step": 680 }, { "epoch": 0.66, "grad_norm": 0.09987049741300834, "learning_rate": 6.278177178164721e-05, "loss": 0.7458, "step": 685 }, { "epoch": 0.66, "grad_norm": 0.10265280285088377, "learning_rate": 6.122779264084932e-05, "loss": 0.7194, "step": 690 }, { "epoch": 0.67, "grad_norm": 0.09519605624873385, "learning_rate": 5.968475625930124e-05, "loss": 0.7788, "step": 695 }, { "epoch": 0.67, "grad_norm": 0.10432725726790697, "learning_rate": 5.815309813131153e-05, "loss": 0.6987, "step": 700 }, { "epoch": 0.68, "grad_norm": 0.0895838664526722, "learning_rate": 5.663325053988112e-05, "loss": 0.7438, "step": 705 }, { "epoch": 0.68, "grad_norm": 0.1020683075125265, "learning_rate": 5.5125642434699044e-05, "loss": 0.7329, "step": 710 }, { "epoch": 0.69, "grad_norm": 0.1254242358800245, 
"learning_rate": 5.363069931107902e-05, "loss": 0.7701, "step": 715 }, { "epoch": 0.69, "grad_norm": 0.09171973242978361, "learning_rate": 5.214884308987136e-05, "loss": 0.7614, "step": 720 }, { "epoch": 0.7, "grad_norm": 0.0973938129022939, "learning_rate": 5.068049199838307e-05, "loss": 0.7654, "step": 725 }, { "epoch": 0.7, "grad_norm": 0.10651861956923717, "learning_rate": 4.9226060452340825e-05, "loss": 0.7459, "step": 730 }, { "epoch": 0.71, "grad_norm": 0.10089952208386434, "learning_rate": 4.7785958938929644e-05, "loss": 0.7259, "step": 735 }, { "epoch": 0.71, "grad_norm": 0.1210514333134373, "learning_rate": 4.6360593900940074e-05, "loss": 0.7434, "step": 740 }, { "epoch": 0.72, "grad_norm": 0.09318485524858554, "learning_rate": 4.4950367622057173e-05, "loss": 0.7452, "step": 745 }, { "epoch": 0.72, "grad_norm": 0.0982653281602369, "learning_rate": 4.355567811332311e-05, "loss": 0.7647, "step": 750 }, { "epoch": 0.73, "grad_norm": 0.10433832624807006, "learning_rate": 4.21769190008056e-05, "loss": 0.7786, "step": 755 }, { "epoch": 0.73, "grad_norm": 0.10945535727159489, "learning_rate": 4.081447941450428e-05, "loss": 0.7534, "step": 760 }, { "epoch": 0.74, "grad_norm": 0.08979908942148239, "learning_rate": 3.946874387852545e-05, "loss": 0.7684, "step": 765 }, { "epoch": 0.74, "grad_norm": 0.10406027811162083, "learning_rate": 3.8140092202557185e-05, "loss": 0.722, "step": 770 }, { "epoch": 0.75, "grad_norm": 0.1035268479061034, "learning_rate": 3.682889937467493e-05, "loss": 0.7553, "step": 775 }, { "epoch": 0.75, "grad_norm": 0.10887919806436043, "learning_rate": 3.553553545550768e-05, "loss": 0.7246, "step": 780 }, { "epoch": 0.76, "grad_norm": 0.09604076773803726, "learning_rate": 3.426036547379528e-05, "loss": 0.7608, "step": 785 }, { "epoch": 0.76, "grad_norm": 0.10956385216460547, "learning_rate": 3.300374932336533e-05, "loss": 0.7338, "step": 790 }, { "epoch": 0.76, "grad_norm": 0.10172528555623694, "learning_rate": 3.176604166155976e-05, "loss": 0.7495, "step": 795 }, { "epoch": 0.77, "grad_norm": 0.10744826094269415, "learning_rate": 3.054759180913921e-05, "loss": 0.8015, "step": 800 }, { "epoch": 0.77, "grad_norm": 0.09531428953628962, "learning_rate": 2.9348743651693357e-05, "loss": 0.7432, "step": 805 }, { "epoch": 0.78, "grad_norm": 0.09578814140538712, "learning_rate": 2.8169835542585587e-05, "loss": 0.6876, "step": 810 }, { "epoch": 0.78, "grad_norm": 0.10906430952489729, "learning_rate": 2.7011200207458677e-05, "loss": 0.7461, "step": 815 }, { "epoch": 0.79, "grad_norm": 0.09924057812689555, "learning_rate": 2.5873164650328996e-05, "loss": 0.7403, "step": 820 }, { "epoch": 0.79, "grad_norm": 0.102534066373728, "learning_rate": 2.4756050061295534e-05, "loss": 0.7771, "step": 825 }, { "epoch": 0.8, "grad_norm": 0.11379854278214475, "learning_rate": 2.36601717258897e-05, "loss": 0.7494, "step": 830 }, { "epoch": 0.8, "grad_norm": 0.09555943569834097, "learning_rate": 2.2585838936091754e-05, "loss": 0.7062, "step": 835 }, { "epoch": 0.81, "grad_norm": 0.08701341120074446, "learning_rate": 2.153335490303856e-05, "loss": 0.7029, "step": 840 }, { "epoch": 0.81, "grad_norm": 0.10955955931583401, "learning_rate": 2.0503016671447785e-05, "loss": 0.7119, "step": 845 }, { "epoch": 0.82, "grad_norm": 0.09961678142736638, "learning_rate": 1.9495115035782307e-05, "loss": 0.7181, "step": 850 }, { "epoch": 0.82, "grad_norm": 0.10875433120436596, "learning_rate": 1.8509934458178712e-05, "loss": 0.7221, "step": 855 }, { "epoch": 0.83, "grad_norm": 0.10419310033343766, 
"learning_rate": 1.754775298816307e-05, "loss": 0.7627, "step": 860 }, { "epoch": 0.83, "grad_norm": 0.11125643674385945, "learning_rate": 1.6608842184176243e-05, "loss": 0.783, "step": 865 }, { "epoch": 0.84, "grad_norm": 0.09805407303806633, "learning_rate": 1.5693467036931576e-05, "loss": 0.7754, "step": 870 }, { "epoch": 0.84, "grad_norm": 0.09382093887295272, "learning_rate": 1.48018858946259e-05, "loss": 0.7306, "step": 875 }, { "epoch": 0.85, "grad_norm": 0.08828049016039903, "learning_rate": 1.3934350390025463e-05, "loss": 0.7277, "step": 880 }, { "epoch": 0.85, "grad_norm": 0.10111049793555948, "learning_rate": 1.3091105369447165e-05, "loss": 0.7433, "step": 885 }, { "epoch": 0.86, "grad_norm": 0.09959284476126003, "learning_rate": 1.22723888236549e-05, "loss": 0.7608, "step": 890 }, { "epoch": 0.86, "grad_norm": 0.12000298215227106, "learning_rate": 1.1478431820691083e-05, "loss": 0.7249, "step": 895 }, { "epoch": 0.87, "grad_norm": 0.09846825711770221, "learning_rate": 1.0709458440661801e-05, "loss": 0.7474, "step": 900 }, { "epoch": 0.87, "grad_norm": 0.08826572755281627, "learning_rate": 9.965685712494199e-06, "loss": 0.7125, "step": 905 }, { "epoch": 0.88, "grad_norm": 0.12402329261776916, "learning_rate": 9.247323552684051e-06, "loss": 0.7685, "step": 910 }, { "epoch": 0.88, "grad_norm": 0.10462400264867344, "learning_rate": 8.554574706050488e-06, "loss": 0.7884, "step": 915 }, { "epoch": 0.89, "grad_norm": 0.11275389812311376, "learning_rate": 7.887634688515e-06, "loss": 0.7487, "step": 920 }, { "epoch": 0.89, "grad_norm": 0.11871268386808256, "learning_rate": 7.246691731920485e-06, "loss": 0.7607, "step": 925 }, { "epoch": 0.89, "grad_norm": 0.12345475324952107, "learning_rate": 6.631926730906324e-06, "loss": 0.7716, "step": 930 }, { "epoch": 0.9, "grad_norm": 0.10653833929745236, "learning_rate": 6.043513191853978e-06, "loss": 0.7465, "step": 935 }, { "epoch": 0.9, "grad_norm": 0.09013425987778603, "learning_rate": 5.481617183918053e-06, "loss": 0.7543, "step": 940 }, { "epoch": 0.91, "grad_norm": 0.09465293194517142, "learning_rate": 4.946397292156158e-06, "loss": 0.736, "step": 945 }, { "epoch": 0.91, "grad_norm": 0.10787353868301028, "learning_rate": 4.438004572771182e-06, "loss": 0.7284, "step": 950 }, { "epoch": 0.92, "grad_norm": 0.1429868308987827, "learning_rate": 3.9565825104783685e-06, "loss": 0.7907, "step": 955 }, { "epoch": 0.92, "grad_norm": 0.10249198050493027, "learning_rate": 3.5022669780093497e-06, "loss": 0.7203, "step": 960 }, { "epoch": 0.93, "grad_norm": 0.1028361192764587, "learning_rate": 3.0751861977645125e-06, "loss": 0.7284, "step": 965 }, { "epoch": 0.93, "grad_norm": 0.1278500182671187, "learning_rate": 2.6754607056244883e-06, "loss": 0.7447, "step": 970 }, { "epoch": 0.94, "grad_norm": 0.09199143538365835, "learning_rate": 2.303203316931102e-06, "loss": 0.7173, "step": 975 }, { "epoch": 0.94, "grad_norm": 0.10797760289216082, "learning_rate": 1.9585190946472488e-06, "loss": 0.7163, "step": 980 }, { "epoch": 0.95, "grad_norm": 0.09281102451549036, "learning_rate": 1.6415053197047725e-06, "loss": 0.7284, "step": 985 }, { "epoch": 0.95, "grad_norm": 0.11431066643386725, "learning_rate": 1.3522514635486816e-06, "loss": 0.7723, "step": 990 }, { "epoch": 0.96, "grad_norm": 0.09277614207193054, "learning_rate": 1.0908391628854041e-06, "loss": 0.7623, "step": 995 }, { "epoch": 0.96, "grad_norm": 0.0970512860952287, "learning_rate": 8.57342196642319e-07, "loss": 0.6736, "step": 1000 }, { "epoch": 0.96, "step": 1000, "total_flos": 
3.3476112847732736e+16, "train_loss": 0.0, "train_runtime": 0.0161, "train_samples_per_second": 1676083.975, "train_steps_per_second": 52354.363 } ], "logging_steps": 5, "max_steps": 844, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 3.3476112847732736e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }
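
The block above is a HuggingFace Transformers `trainer_state.json`: `log_history` holds the per-step logging entries (loss, gradient norm, learning rate every 5 steps), followed by one end-of-training summary entry and the top-level run settings. Below is a minimal sketch for inspecting it, assuming the file is saved under the Trainer's default name `trainer_state.json`; the output filename is illustrative, not part of the log.

```python
# Sketch: load trainer_state.json and plot the loss / learning-rate curves
# recorded in log_history. Requires matplotlib.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:  # default name written by transformers.Trainer
    state = json.load(f)

steps, losses, lrs = [], [], []
for entry in state["log_history"]:
    # Skip the final summary entry, which reports train_loss/train_runtime
    # instead of the per-step "loss" and "learning_rate" keys.
    if "loss" in entry and "learning_rate" in entry:
        steps.append(entry["step"])
        losses.append(entry["loss"])
        lrs.append(entry["learning_rate"])

fig, (ax_loss, ax_lr) = plt.subplots(1, 2, figsize=(10, 4))
ax_loss.plot(steps, losses)
ax_loss.set_xlabel("step")
ax_loss.set_ylabel("training loss")
ax_lr.plot(steps, lrs)
ax_lr.set_xlabel("step")
ax_lr.set_ylabel("learning rate")
fig.tight_layout()
fig.savefig("training_curves.png", dpi=150)  # illustrative output path
```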