{ "best_metric": 2.3844265937805176, "best_model_checkpoint": "ckpt/llama2_13b_fuze15_no_sys/strategyqa_no_sys/checkpoint-500", "epoch": 10.0, "eval_steps": 50, "global_step": 650, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.15, "grad_norm": 0.40633881092071533, "learning_rate": 5e-05, "loss": 1.8151, "step": 10 }, { "epoch": 0.31, "grad_norm": 0.5893375873565674, "learning_rate": 0.0001, "loss": 1.5545, "step": 20 }, { "epoch": 0.46, "grad_norm": 0.4787861406803131, "learning_rate": 9.993784606094612e-05, "loss": 1.2226, "step": 30 }, { "epoch": 0.62, "grad_norm": 0.5218197703361511, "learning_rate": 9.975153876827008e-05, "loss": 1.1648, "step": 40 }, { "epoch": 0.77, "grad_norm": 0.4085441827774048, "learning_rate": 9.944154131125642e-05, "loss": 1.1108, "step": 50 }, { "epoch": 0.77, "eval_loss": 1.1378295421600342, "eval_runtime": 1.8486, "eval_samples_per_second": 148.764, "eval_steps_per_second": 6.492, "step": 50 }, { "epoch": 0.92, "grad_norm": 0.38595476746559143, "learning_rate": 9.900862439242719e-05, "loss": 1.1433, "step": 60 }, { "epoch": 1.08, "grad_norm": 0.38242650032043457, "learning_rate": 9.84538643114539e-05, "loss": 1.1424, "step": 70 }, { "epoch": 1.23, "grad_norm": 0.36820024251937866, "learning_rate": 9.777864028930705e-05, "loss": 1.092, "step": 80 }, { "epoch": 1.38, "grad_norm": 0.40036314725875854, "learning_rate": 9.698463103929542e-05, "loss": 1.0537, "step": 90 }, { "epoch": 1.54, "grad_norm": 0.45688629150390625, "learning_rate": 9.607381059352038e-05, "loss": 1.0652, "step": 100 }, { "epoch": 1.54, "eval_loss": 1.1235716342926025, "eval_runtime": 1.8515, "eval_samples_per_second": 148.526, "eval_steps_per_second": 6.481, "step": 100 }, { "epoch": 1.69, "grad_norm": 0.4719078540802002, "learning_rate": 9.504844339512095e-05, "loss": 1.0962, "step": 110 }, { "epoch": 1.85, "grad_norm": 0.5598475933074951, "learning_rate": 9.391107866851143e-05, "loss": 1.067, "step": 120 }, { "epoch": 2.0, "grad_norm": 0.5334481000900269, "learning_rate": 9.266454408160779e-05, "loss": 1.0723, "step": 130 }, { "epoch": 2.15, "grad_norm": 0.5928361415863037, "learning_rate": 9.131193871579975e-05, "loss": 0.9459, "step": 140 }, { "epoch": 2.31, "grad_norm": 0.8523222804069519, "learning_rate": 8.985662536114613e-05, "loss": 0.9679, "step": 150 }, { "epoch": 2.31, "eval_loss": 1.1641343832015991, "eval_runtime": 1.8561, "eval_samples_per_second": 148.157, "eval_steps_per_second": 6.465, "step": 150 }, { "epoch": 2.46, "grad_norm": 0.9512267112731934, "learning_rate": 8.83022221559489e-05, "loss": 0.8452, "step": 160 }, { "epoch": 2.62, "grad_norm": 0.8635448217391968, "learning_rate": 8.665259359149132e-05, "loss": 0.8922, "step": 170 }, { "epoch": 2.77, "grad_norm": 1.3180458545684814, "learning_rate": 8.491184090430364e-05, "loss": 0.8441, "step": 180 }, { "epoch": 2.92, "grad_norm": 1.2508196830749512, "learning_rate": 8.308429187984297e-05, "loss": 0.9151, "step": 190 }, { "epoch": 3.08, "grad_norm": 1.8130565881729126, "learning_rate": 8.117449009293668e-05, "loss": 0.7321, "step": 200 }, { "epoch": 3.08, "eval_loss": 1.3882437944412231, "eval_runtime": 1.8575, "eval_samples_per_second": 148.047, "eval_steps_per_second": 6.46, "step": 200 }, { "epoch": 3.23, "grad_norm": 1.3749483823776245, "learning_rate": 7.91871836117395e-05, "loss": 0.5473, "step": 210 }, { "epoch": 3.38, "grad_norm": 2.105992078781128, "learning_rate": 7.712731319328798e-05, "loss": 0.549, "step": 220 }, { "epoch": 3.54, "grad_norm": 1.8341376781463623, "learning_rate": 7.500000000000001e-05, "loss": 0.5206, "step": 230 }, { "epoch": 3.69, "grad_norm": 1.7375411987304688, "learning_rate": 7.281053286765815e-05, "loss": 0.5435, "step": 240 }, { "epoch": 3.85, "grad_norm": 1.8721497058868408, "learning_rate": 7.056435515653059e-05, "loss": 0.5562, "step": 250 }, { "epoch": 3.85, "eval_loss": 1.3442635536193848, "eval_runtime": 1.8579, "eval_samples_per_second": 148.019, "eval_steps_per_second": 6.459, "step": 250 }, { "epoch": 4.0, "grad_norm": 1.7349268198013306, "learning_rate": 6.826705121831976e-05, "loss": 0.5269, "step": 260 }, { "epoch": 4.15, "grad_norm": 2.820291757583618, "learning_rate": 6.592433251258423e-05, "loss": 0.3063, "step": 270 }, { "epoch": 4.31, "grad_norm": 1.9900763034820557, "learning_rate": 6.354202340715026e-05, "loss": 0.2743, "step": 280 }, { "epoch": 4.46, "grad_norm": 2.0194549560546875, "learning_rate": 6.112604669781572e-05, "loss": 0.2718, "step": 290 }, { "epoch": 4.62, "grad_norm": 2.1579859256744385, "learning_rate": 5.868240888334653e-05, "loss": 0.2699, "step": 300 }, { "epoch": 4.62, "eval_loss": 1.6108604669570923, "eval_runtime": 1.858, "eval_samples_per_second": 148.007, "eval_steps_per_second": 6.458, "step": 300 }, { "epoch": 4.77, "grad_norm": 1.8816920518875122, "learning_rate": 5.621718523237427e-05, "loss": 0.2749, "step": 310 }, { "epoch": 4.92, "grad_norm": 1.9615857601165771, "learning_rate": 5.373650467932122e-05, "loss": 0.2582, "step": 320 }, { "epoch": 5.08, "grad_norm": 1.4868065118789673, "learning_rate": 5.124653458690365e-05, "loss": 0.2131, "step": 330 }, { "epoch": 5.23, "grad_norm": 2.2800121307373047, "learning_rate": 4.875346541309637e-05, "loss": 0.1242, "step": 340 }, { "epoch": 5.38, "grad_norm": 1.9804664850234985, "learning_rate": 4.626349532067879e-05, "loss": 0.1135, "step": 350 }, { "epoch": 5.38, "eval_loss": 1.902097463607788, "eval_runtime": 1.8585, "eval_samples_per_second": 147.969, "eval_steps_per_second": 6.457, "step": 350 }, { "epoch": 5.54, "grad_norm": 2.2013049125671387, "learning_rate": 4.378281476762576e-05, "loss": 0.1092, "step": 360 }, { "epoch": 5.69, "grad_norm": 2.5137171745300293, "learning_rate": 4.131759111665349e-05, "loss": 0.1171, "step": 370 }, { "epoch": 5.85, "grad_norm": 1.885507583618164, "learning_rate": 3.887395330218429e-05, "loss": 0.1155, "step": 380 }, { "epoch": 6.0, "grad_norm": 1.9361807107925415, "learning_rate": 3.6457976592849754e-05, "loss": 0.1274, "step": 390 }, { "epoch": 6.15, "grad_norm": 1.8210408687591553, "learning_rate": 3.4075667487415785e-05, "loss": 0.0511, "step": 400 }, { "epoch": 6.15, "eval_loss": 2.216161012649536, "eval_runtime": 1.8599, "eval_samples_per_second": 147.855, "eval_steps_per_second": 6.452, "step": 400 }, { "epoch": 6.31, "grad_norm": 1.851035475730896, "learning_rate": 3.173294878168025e-05, "loss": 0.0489, "step": 410 }, { "epoch": 6.46, "grad_norm": 1.3450051546096802, "learning_rate": 2.9435644843469436e-05, "loss": 0.0419, "step": 420 }, { "epoch": 6.62, "grad_norm": 1.9799576997756958, "learning_rate": 2.718946713234185e-05, "loss": 0.053, "step": 430 }, { "epoch": 6.77, "grad_norm": 1.1950169801712036, "learning_rate": 2.500000000000001e-05, "loss": 0.0422, "step": 440 }, { "epoch": 6.92, "grad_norm": 1.1742631196975708, "learning_rate": 2.2872686806712035e-05, "loss": 0.049, "step": 450 }, { "epoch": 6.92, "eval_loss": 2.0983123779296875, "eval_runtime": 1.8568, "eval_samples_per_second": 148.106, "eval_steps_per_second": 6.463, "step": 450 }, { "epoch": 7.08, "grad_norm": 0.7656117081642151, "learning_rate": 2.0812816388260518e-05, "loss": 0.0362, "step": 460 }, { "epoch": 7.23, "grad_norm": 1.1246815919876099, "learning_rate": 1.8825509907063327e-05, "loss": 0.0207, "step": 470 }, { "epoch": 7.38, "grad_norm": 0.8675357699394226, "learning_rate": 1.691570812015704e-05, "loss": 0.0266, "step": 480 }, { "epoch": 7.54, "grad_norm": 0.9884489178657532, "learning_rate": 1.5088159095696363e-05, "loss": 0.0167, "step": 490 }, { "epoch": 7.69, "grad_norm": 0.9568442106246948, "learning_rate": 1.3347406408508695e-05, "loss": 0.0212, "step": 500 }, { "epoch": 7.69, "eval_loss": 2.3844265937805176, "eval_runtime": 1.8585, "eval_samples_per_second": 147.972, "eval_steps_per_second": 6.457, "step": 500 }, { "epoch": 7.85, "grad_norm": 0.8196859955787659, "learning_rate": 1.1697777844051105e-05, "loss": 0.0186, "step": 510 }, { "epoch": 8.0, "grad_norm": 0.6307012438774109, "learning_rate": 1.0143374638853891e-05, "loss": 0.0195, "step": 520 }, { "epoch": 8.15, "grad_norm": 0.2967863976955414, "learning_rate": 8.688061284200266e-06, "loss": 0.0087, "step": 530 }, { "epoch": 8.31, "grad_norm": 0.3737642765045166, "learning_rate": 7.33545591839222e-06, "loss": 0.0111, "step": 540 }, { "epoch": 8.46, "grad_norm": 0.8899824619293213, "learning_rate": 6.088921331488568e-06, "loss": 0.0107, "step": 550 }, { "epoch": 8.46, "eval_loss": 2.5323970317840576, "eval_runtime": 1.8601, "eval_samples_per_second": 147.844, "eval_steps_per_second": 6.451, "step": 550 }, { "epoch": 8.62, "grad_norm": 0.698161244392395, "learning_rate": 4.951556604879048e-06, "loss": 0.0102, "step": 560 }, { "epoch": 8.77, "grad_norm": 0.7657814025878906, "learning_rate": 3.9261894064796135e-06, "loss": 0.0108, "step": 570 }, { "epoch": 8.92, "grad_norm": 0.3906441926956177, "learning_rate": 3.0153689607045845e-06, "loss": 0.0105, "step": 580 }, { "epoch": 9.08, "grad_norm": 0.23798762261867523, "learning_rate": 2.221359710692961e-06, "loss": 0.0104, "step": 590 }, { "epoch": 9.23, "grad_norm": 0.18085721135139465, "learning_rate": 1.5461356885461075e-06, "loss": 0.006, "step": 600 }, { "epoch": 9.23, "eval_loss": 2.5656862258911133, "eval_runtime": 1.8572, "eval_samples_per_second": 148.074, "eval_steps_per_second": 6.461, "step": 600 }, { "epoch": 9.38, "grad_norm": 0.243088960647583, "learning_rate": 9.913756075728087e-07, "loss": 0.009, "step": 610 }, { "epoch": 9.54, "grad_norm": 0.16859322786331177, "learning_rate": 5.584586887435739e-07, "loss": 0.0077, "step": 620 }, { "epoch": 9.69, "grad_norm": 0.35239869356155396, "learning_rate": 2.4846123172992954e-07, "loss": 0.0085, "step": 630 }, { "epoch": 9.85, "grad_norm": 0.28910598158836365, "learning_rate": 6.215393905388278e-08, "loss": 0.0082, "step": 640 }, { "epoch": 10.0, "grad_norm": 0.20681537687778473, "learning_rate": 0.0, "loss": 0.0131, "step": 650 }, { "epoch": 10.0, "eval_loss": 2.5764715671539307, "eval_runtime": 1.8597, "eval_samples_per_second": 147.872, "eval_steps_per_second": 6.453, "step": 650 }, { "epoch": 10.0, "step": 650, "total_flos": 1.3867479608590336e+17, "train_loss": 0.4327951647226627, "train_runtime": 356.3752, "train_samples_per_second": 43.69, "train_steps_per_second": 1.824 } ], "logging_steps": 10, "max_steps": 650, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 1.3867479608590336e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }