{
  "best_metric": 0.10525013506412506,
  "best_model_checkpoint": "saves/Llama-3.1-8B-Instruct/lora/sft-900/checkpoint-450",
  "epoch": 9.876543209876543,
  "eval_steps": 50,
  "global_step": 500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.19753086419753085,
      "grad_norm": 5.285682678222656,
      "learning_rate": 9.000000000000001e-07,
      "loss": 1.6663,
      "step": 10
    },
    {
      "epoch": 0.3950617283950617,
      "grad_norm": 7.212312698364258,
      "learning_rate": 1.9000000000000002e-06,
      "loss": 1.6571,
      "step": 20
    },
    {
      "epoch": 0.5925925925925926,
      "grad_norm": 6.918689250946045,
      "learning_rate": 2.9e-06,
      "loss": 1.6049,
      "step": 30
    },
    {
      "epoch": 0.7901234567901234,
      "grad_norm": 4.972626209259033,
      "learning_rate": 3.900000000000001e-06,
      "loss": 1.345,
      "step": 40
    },
    {
      "epoch": 0.9876543209876543,
      "grad_norm": 4.738757133483887,
      "learning_rate": 4.9000000000000005e-06,
      "loss": 1.201,
      "step": 50
    },
    {
      "epoch": 0.9876543209876543,
      "eval_loss": 1.0016428232192993,
      "eval_runtime": 2.2723,
      "eval_samples_per_second": 39.607,
      "eval_steps_per_second": 19.803,
      "step": 50
    },
    {
      "epoch": 1.1851851851851851,
      "grad_norm": 3.143040180206299,
      "learning_rate": 4.9950668210706795e-06,
      "loss": 0.8783,
      "step": 60
    },
    {
      "epoch": 1.382716049382716,
      "grad_norm": 2.624129295349121,
      "learning_rate": 4.978038850628855e-06,
      "loss": 0.6789,
      "step": 70
    },
    {
      "epoch": 1.5802469135802468,
      "grad_norm": 2.1852622032165527,
      "learning_rate": 4.94893812399836e-06,
      "loss": 0.4397,
      "step": 80
    },
    {
      "epoch": 1.7777777777777777,
      "grad_norm": 1.8680675029754639,
      "learning_rate": 4.907906416994146e-06,
      "loss": 0.2305,
      "step": 90
    },
    {
      "epoch": 1.9753086419753085,
      "grad_norm": 1.5398958921432495,
      "learning_rate": 4.855143631968242e-06,
      "loss": 0.1407,
      "step": 100
    },
    {
      "epoch": 1.9753086419753085,
      "eval_loss": 0.15125828981399536,
      "eval_runtime": 2.2668,
      "eval_samples_per_second": 39.703,
      "eval_steps_per_second": 19.851,
      "step": 100
    },
    {
      "epoch": 2.1728395061728394,
      "grad_norm": 2.09860897064209,
      "learning_rate": 4.790906823905599e-06,
      "loss": 0.1319,
      "step": 110
    },
    {
      "epoch": 2.3703703703703702,
      "grad_norm": 0.7738865613937378,
      "learning_rate": 4.715508948078037e-06,
      "loss": 0.0906,
      "step": 120
    },
    {
      "epoch": 2.567901234567901,
      "grad_norm": 0.3859764039516449,
      "learning_rate": 4.62931733535762e-06,
      "loss": 0.0772,
      "step": 130
    },
    {
      "epoch": 2.765432098765432,
      "grad_norm": 0.847908079624176,
      "learning_rate": 4.5327519026175694e-06,
      "loss": 0.0878,
      "step": 140
    },
    {
      "epoch": 2.962962962962963,
      "grad_norm": 0.6789230108261108,
      "learning_rate": 4.426283106939474e-06,
      "loss": 0.0885,
      "step": 150
    },
    {
      "epoch": 2.962962962962963,
      "eval_loss": 0.10822762548923492,
      "eval_runtime": 2.2637,
      "eval_samples_per_second": 39.757,
      "eval_steps_per_second": 19.879,
      "step": 150
    },
    {
      "epoch": 3.1604938271604937,
      "grad_norm": 0.82857745885849,
      "learning_rate": 4.3104296535936695e-06,
      "loss": 0.0712,
      "step": 160
    },
    {
      "epoch": 3.3580246913580245,
      "grad_norm": 0.6130207777023315,
      "learning_rate": 4.185755968959308e-06,
      "loss": 0.0918,
      "step": 170
    },
    {
      "epoch": 3.5555555555555554,
      "grad_norm": 0.8012928366661072,
      "learning_rate": 4.052869450695776e-06,
      "loss": 0.0698,
      "step": 180
    },
    {
      "epoch": 3.753086419753086,
      "grad_norm": 1.5906227827072144,
      "learning_rate": 3.912417508562345e-06,
      "loss": 0.0873,
      "step": 190
    },
    {
      "epoch": 3.950617283950617,
      "grad_norm": 0.6971777677536011,
      "learning_rate": 3.7650844103029093e-06,
      "loss": 0.0743,
      "step": 200
    },
    {
      "epoch": 3.950617283950617,
      "eval_loss": 0.10680027306079865,
      "eval_runtime": 2.2702,
      "eval_samples_per_second": 39.644,
      "eval_steps_per_second": 19.822,
      "step": 200
    },
    {
      "epoch": 4.148148148148148,
      "grad_norm": 0.9408888816833496,
      "learning_rate": 3.611587947962319e-06,
      "loss": 0.0622,
      "step": 210
    },
    {
      "epoch": 4.345679012345679,
      "grad_norm": 0.5946834087371826,
      "learning_rate": 3.452675940875686e-06,
      "loss": 0.0883,
      "step": 220
    },
    {
      "epoch": 4.54320987654321,
      "grad_norm": 0.592318594455719,
      "learning_rate": 3.2891225923677565e-06,
      "loss": 0.0546,
      "step": 230
    },
    {
      "epoch": 4.7407407407407405,
      "grad_norm": 0.7733595967292786,
      "learning_rate": 3.121724717912138e-06,
      "loss": 0.052,
      "step": 240
    },
    {
      "epoch": 4.938271604938271,
      "grad_norm": 0.7871303558349609,
      "learning_rate": 2.9512978631264006e-06,
      "loss": 0.0855,
      "step": 250
    },
    {
      "epoch": 4.938271604938271,
      "eval_loss": 0.10621543228626251,
      "eval_runtime": 2.2684,
      "eval_samples_per_second": 39.675,
      "eval_steps_per_second": 19.837,
      "step": 250
    },
    {
      "epoch": 5.135802469135802,
      "grad_norm": 0.4525160789489746,
      "learning_rate": 2.778672330515814e-06,
      "loss": 0.0887,
      "step": 260
    },
    {
      "epoch": 5.333333333333333,
      "grad_norm": 1.1445720195770264,
      "learning_rate": 2.604689134322999e-06,
      "loss": 0.058,
      "step": 270
    },
    {
      "epoch": 5.530864197530864,
      "grad_norm": 1.6000568866729736,
      "learning_rate": 2.4301959031910785e-06,
      "loss": 0.0671,
      "step": 280
    },
    {
      "epoch": 5.728395061728395,
      "grad_norm": 0.7566145062446594,
      "learning_rate": 2.256042750602127e-06,
      "loss": 0.0563,
      "step": 290
    },
    {
      "epoch": 5.925925925925926,
      "grad_norm": 0.996746838092804,
      "learning_rate": 2.0830781332097446e-06,
      "loss": 0.0571,
      "step": 300
    },
    {
      "epoch": 5.925925925925926,
      "eval_loss": 0.10575470328330994,
      "eval_runtime": 2.2646,
      "eval_samples_per_second": 39.742,
      "eval_steps_per_second": 19.871,
      "step": 300
    },
    {
      "epoch": 6.1234567901234565,
      "grad_norm": 0.46847003698349,
      "learning_rate": 1.912144717243525e-06,
      "loss": 0.0644,
      "step": 310
    },
    {
      "epoch": 6.320987654320987,
      "grad_norm": 0.6955059766769409,
      "learning_rate": 1.744075273123889e-06,
      "loss": 0.0574,
      "step": 320
    },
    {
      "epoch": 6.518518518518518,
      "grad_norm": 1.0270895957946777,
      "learning_rate": 1.5796886182883053e-06,
      "loss": 0.0642,
      "step": 330
    },
    {
      "epoch": 6.716049382716049,
      "grad_norm": 0.8975910544395447,
      "learning_rate": 1.419785627995044e-06,
      "loss": 0.0584,
      "step": 340
    },
    {
      "epoch": 6.91358024691358,
      "grad_norm": 1.245640754699707,
      "learning_rate": 1.2651453335394232e-06,
      "loss": 0.063,
      "step": 350
    },
    {
      "epoch": 6.91358024691358,
      "eval_loss": 0.10544978082180023,
      "eval_runtime": 2.2637,
      "eval_samples_per_second": 39.758,
      "eval_steps_per_second": 19.879,
      "step": 350
    },
    {
      "epoch": 7.111111111111111,
      "grad_norm": 0.7810999155044556,
      "learning_rate": 1.11652112689164e-06,
      "loss": 0.0585,
      "step": 360
    },
    {
      "epoch": 7.308641975308642,
      "grad_norm": 1.3329459428787231,
      "learning_rate": 9.746370902468311e-07,
      "loss": 0.0556,
      "step": 370
    },
    {
      "epoch": 7.506172839506172,
      "grad_norm": 0.9610387086868286,
      "learning_rate": 8.40184468369396e-07,
      "loss": 0.0665,
      "step": 380
    },
    {
      "epoch": 7.703703703703704,
      "grad_norm": 1.5703972578048706,
      "learning_rate": 7.138183009179922e-07,
      "loss": 0.0513,
      "step": 390
    },
    {
      "epoch": 7.901234567901234,
      "grad_norm": 0.517612636089325,
      "learning_rate": 5.961542311581586e-07,
      "loss": 0.0597,
      "step": 400
    },
    {
      "epoch": 7.901234567901234,
      "eval_loss": 0.10566332191228867,
      "eval_runtime": 2.265,
      "eval_samples_per_second": 39.736,
      "eval_steps_per_second": 19.868,
      "step": 400
    },
    {
      "epoch": 8.098765432098766,
      "grad_norm": 0.3458334803581238,
      "learning_rate": 4.87765506610215e-07,
      "loss": 0.0481,
      "step": 410
    },
    {
      "epoch": 8.296296296296296,
      "grad_norm": 1.3999119997024536,
      "learning_rate": 3.891801862449629e-07,
      "loss": 0.0474,
      "step": 420
    },
    {
      "epoch": 8.493827160493828,
      "grad_norm": 1.124482274055481,
      "learning_rate": 3.0087856783345916e-07,
      "loss": 0.0455,
      "step": 430
    },
    {
      "epoch": 8.691358024691358,
      "grad_norm": 1.1105051040649414,
      "learning_rate": 2.2329084798455747e-07,
      "loss": 0.0609,
      "step": 440
    },
    {
      "epoch": 8.88888888888889,
      "grad_norm": 1.2002372741699219,
      "learning_rate": 1.567950262702714e-07,
      "loss": 0.0694,
      "step": 450
    },
    {
      "epoch": 8.88888888888889,
      "eval_loss": 0.10525013506412506,
      "eval_runtime": 2.2672,
      "eval_samples_per_second": 39.696,
      "eval_steps_per_second": 19.848,
      "step": 450
    },
    {
      "epoch": 9.08641975308642,
      "grad_norm": 0.9672008752822876,
      "learning_rate": 1.0171506364985622e-07,
      "loss": 0.0483,
      "step": 460
    },
    {
      "epoch": 9.283950617283951,
      "grad_norm": 0.4676777422428131,
      "learning_rate": 5.83193041645802e-08,
      "loss": 0.0691,
      "step": 470
    },
    {
      "epoch": 9.481481481481481,
      "grad_norm": 2.014739751815796,
      "learning_rate": 2.681916759252917e-08,
      "loss": 0.0612,
      "step": 480
    },
    {
      "epoch": 9.679012345679013,
      "grad_norm": 1.5374245643615723,
      "learning_rate": 7.368119432699383e-09,
      "loss": 0.0461,
      "step": 490
    },
    {
      "epoch": 9.876543209876543,
      "grad_norm": 0.8854678273200989,
      "learning_rate": 6.092323651313293e-11,
      "loss": 0.0593,
      "step": 500
    },
    {
      "epoch": 9.876543209876543,
      "eval_loss": 0.10525856912136078,
      "eval_runtime": 2.2702,
      "eval_samples_per_second": 39.645,
      "eval_steps_per_second": 19.822,
      "step": 500
    },
    {
      "epoch": 9.876543209876543,
      "step": 500,
      "total_flos": 4.40152941896663e+16,
      "train_loss": 0.2507400060892105,
      "train_runtime": 715.6643,
      "train_samples_per_second": 11.318,
      "train_steps_per_second": 0.699
    }
  ],
  "logging_steps": 10,
  "max_steps": 500,
  "num_input_tokens_seen": 0,
  "save_steps": 50,
  "num_train_epochs": 10,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 4.40152941896663e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}