| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 4.977777777777778, | |
| "eval_steps": 500, | |
| "global_step": 560, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.044444444444444446, | |
| "grad_norm": 32.500944042241564, | |
| "learning_rate": 4.4642857142857147e-07, | |
| "loss": 2.7887, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.08888888888888889, | |
| "grad_norm": 40.795635794015915, | |
| "learning_rate": 8.928571428571429e-07, | |
| "loss": 2.5988, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.13333333333333333, | |
| "grad_norm": 25.628980626256435, | |
| "learning_rate": 1.3392857142857143e-06, | |
| "loss": 2.4923, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.17777777777777778, | |
| "grad_norm": 19.792522935498546, | |
| "learning_rate": 1.7857142857142859e-06, | |
| "loss": 2.1781, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.2222222222222222, | |
| "grad_norm": 9.121228345410586, | |
| "learning_rate": 2.2321428571428573e-06, | |
| "loss": 1.7957, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.26666666666666666, | |
| "grad_norm": 9.39951433523568, | |
| "learning_rate": 2.6785714285714285e-06, | |
| "loss": 1.7309, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.3111111111111111, | |
| "grad_norm": 7.067368203953625, | |
| "learning_rate": 3.125e-06, | |
| "loss": 1.6744, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.35555555555555557, | |
| "grad_norm": 9.451965886898426, | |
| "learning_rate": 3.5714285714285718e-06, | |
| "loss": 1.5462, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 4.915304589330825, | |
| "learning_rate": 4.017857142857143e-06, | |
| "loss": 1.5017, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.4444444444444444, | |
| "grad_norm": 5.750050607520634, | |
| "learning_rate": 4.464285714285715e-06, | |
| "loss": 1.4425, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.4888888888888889, | |
| "grad_norm": 5.376259477080128, | |
| "learning_rate": 4.910714285714286e-06, | |
| "loss": 1.4034, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.5333333333333333, | |
| "grad_norm": 4.768973432623137, | |
| "learning_rate": 5.357142857142857e-06, | |
| "loss": 1.3321, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.5777777777777777, | |
| "grad_norm": 4.538004676055893, | |
| "learning_rate": 5.8035714285714295e-06, | |
| "loss": 1.3438, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.6222222222222222, | |
| "grad_norm": 5.363034835371456, | |
| "learning_rate": 6.25e-06, | |
| "loss": 1.301, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.6666666666666666, | |
| "grad_norm": 6.967003160329876, | |
| "learning_rate": 6.696428571428571e-06, | |
| "loss": 1.2855, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.7111111111111111, | |
| "grad_norm": 4.742995954295704, | |
| "learning_rate": 7.1428571428571436e-06, | |
| "loss": 1.3305, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.7555555555555555, | |
| "grad_norm": 5.390782810525758, | |
| "learning_rate": 7.589285714285714e-06, | |
| "loss": 1.2897, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 4.68271065441972, | |
| "learning_rate": 8.035714285714286e-06, | |
| "loss": 1.2756, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.8444444444444444, | |
| "grad_norm": 4.043774521320921, | |
| "learning_rate": 8.482142857142858e-06, | |
| "loss": 1.2842, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.8888888888888888, | |
| "grad_norm": 4.446998328249908, | |
| "learning_rate": 8.92857142857143e-06, | |
| "loss": 1.2697, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.9333333333333333, | |
| "grad_norm": 4.572696006859408, | |
| "learning_rate": 9.375000000000001e-06, | |
| "loss": 1.2443, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.9777777777777777, | |
| "grad_norm": 4.9707682732469705, | |
| "learning_rate": 9.821428571428573e-06, | |
| "loss": 1.2141, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 1.0222222222222221, | |
| "grad_norm": 4.178825143377483, | |
| "learning_rate": 9.933035714285715e-06, | |
| "loss": 1.1366, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 1.0666666666666667, | |
| "grad_norm": 4.5133063603083565, | |
| "learning_rate": 9.821428571428573e-06, | |
| "loss": 1.0292, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 1.1111111111111112, | |
| "grad_norm": 4.162274701520531, | |
| "learning_rate": 9.70982142857143e-06, | |
| "loss": 1.0506, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 1.1555555555555554, | |
| "grad_norm": 5.452346452279286, | |
| "learning_rate": 9.598214285714287e-06, | |
| "loss": 0.9709, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 4.781896722092587, | |
| "learning_rate": 9.486607142857144e-06, | |
| "loss": 0.9563, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 1.2444444444444445, | |
| "grad_norm": 4.445046867948941, | |
| "learning_rate": 9.375000000000001e-06, | |
| "loss": 0.9895, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 1.2888888888888888, | |
| "grad_norm": 4.758711829216556, | |
| "learning_rate": 9.263392857142858e-06, | |
| "loss": 0.9801, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 1.3333333333333333, | |
| "grad_norm": 4.609910361855944, | |
| "learning_rate": 9.151785714285715e-06, | |
| "loss": 1.0064, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 1.3777777777777778, | |
| "grad_norm": 4.87480001558543, | |
| "learning_rate": 9.040178571428572e-06, | |
| "loss": 0.9139, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 1.4222222222222223, | |
| "grad_norm": 5.32380555846839, | |
| "learning_rate": 8.92857142857143e-06, | |
| "loss": 0.9636, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 1.4666666666666668, | |
| "grad_norm": 4.8867045100692215, | |
| "learning_rate": 8.816964285714286e-06, | |
| "loss": 1.0211, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 1.511111111111111, | |
| "grad_norm": 4.957309018955932, | |
| "learning_rate": 8.705357142857143e-06, | |
| "loss": 0.979, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 1.5555555555555556, | |
| "grad_norm": 5.073057357471921, | |
| "learning_rate": 8.59375e-06, | |
| "loss": 0.9458, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 5.44812503643663, | |
| "learning_rate": 8.482142857142858e-06, | |
| "loss": 0.9874, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 1.6444444444444444, | |
| "grad_norm": 4.882464064452815, | |
| "learning_rate": 8.370535714285715e-06, | |
| "loss": 0.9544, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 1.6888888888888889, | |
| "grad_norm": 5.028418460708894, | |
| "learning_rate": 8.258928571428572e-06, | |
| "loss": 0.9767, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 1.7333333333333334, | |
| "grad_norm": 5.161967968478642, | |
| "learning_rate": 8.147321428571429e-06, | |
| "loss": 0.8801, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 1.7777777777777777, | |
| "grad_norm": 5.235225489917799, | |
| "learning_rate": 8.035714285714286e-06, | |
| "loss": 0.9793, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.8222222222222222, | |
| "grad_norm": 4.707730730496147, | |
| "learning_rate": 7.924107142857143e-06, | |
| "loss": 1.0178, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 1.8666666666666667, | |
| "grad_norm": 4.805494574009503, | |
| "learning_rate": 7.8125e-06, | |
| "loss": 0.9473, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 1.911111111111111, | |
| "grad_norm": 4.499773028240189, | |
| "learning_rate": 7.700892857142857e-06, | |
| "loss": 0.9764, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 1.9555555555555557, | |
| "grad_norm": 4.643876487232801, | |
| "learning_rate": 7.589285714285714e-06, | |
| "loss": 0.938, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 5.871600124695231, | |
| "learning_rate": 7.4776785714285714e-06, | |
| "loss": 0.9604, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 2.0444444444444443, | |
| "grad_norm": 4.167242587255422, | |
| "learning_rate": 7.366071428571429e-06, | |
| "loss": 0.5353, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 2.088888888888889, | |
| "grad_norm": 4.379267414326139, | |
| "learning_rate": 7.2544642857142865e-06, | |
| "loss": 0.4706, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 2.1333333333333333, | |
| "grad_norm": 6.644473939577757, | |
| "learning_rate": 7.1428571428571436e-06, | |
| "loss": 0.46, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 2.1777777777777776, | |
| "grad_norm": 5.288572083384898, | |
| "learning_rate": 7.031250000000001e-06, | |
| "loss": 0.4504, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 2.2222222222222223, | |
| "grad_norm": 4.803175146194689, | |
| "learning_rate": 6.919642857142858e-06, | |
| "loss": 0.4555, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 2.2666666666666666, | |
| "grad_norm": 4.727379036726369, | |
| "learning_rate": 6.808035714285715e-06, | |
| "loss": 0.4442, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 2.311111111111111, | |
| "grad_norm": 6.018076787689891, | |
| "learning_rate": 6.696428571428571e-06, | |
| "loss": 0.4521, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 2.3555555555555556, | |
| "grad_norm": 5.308280827222846, | |
| "learning_rate": 6.58482142857143e-06, | |
| "loss": 0.4575, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 5.151088902332344, | |
| "learning_rate": 6.473214285714287e-06, | |
| "loss": 0.4549, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 2.4444444444444446, | |
| "grad_norm": 5.22916509750735, | |
| "learning_rate": 6.361607142857143e-06, | |
| "loss": 0.4321, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 2.488888888888889, | |
| "grad_norm": 6.577328994449813, | |
| "learning_rate": 6.25e-06, | |
| "loss": 0.4752, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 2.533333333333333, | |
| "grad_norm": 4.620136232064613, | |
| "learning_rate": 6.138392857142857e-06, | |
| "loss": 0.4309, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 2.5777777777777775, | |
| "grad_norm": 5.263417646651918, | |
| "learning_rate": 6.0267857142857145e-06, | |
| "loss": 0.4748, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 2.6222222222222222, | |
| "grad_norm": 6.104062115464887, | |
| "learning_rate": 5.9151785714285716e-06, | |
| "loss": 0.4645, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 2.6666666666666665, | |
| "grad_norm": 5.547768245944595, | |
| "learning_rate": 5.8035714285714295e-06, | |
| "loss": 0.4482, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 2.7111111111111112, | |
| "grad_norm": 5.127865048666611, | |
| "learning_rate": 5.691964285714287e-06, | |
| "loss": 0.4553, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 2.7555555555555555, | |
| "grad_norm": 5.69799733801961, | |
| "learning_rate": 5.580357142857144e-06, | |
| "loss": 0.455, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "grad_norm": 5.551292184720094, | |
| "learning_rate": 5.468750000000001e-06, | |
| "loss": 0.4427, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 2.8444444444444446, | |
| "grad_norm": 4.742335226769835, | |
| "learning_rate": 5.357142857142857e-06, | |
| "loss": 0.4784, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 2.888888888888889, | |
| "grad_norm": 5.565975549397164, | |
| "learning_rate": 5.245535714285714e-06, | |
| "loss": 0.4413, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 2.9333333333333336, | |
| "grad_norm": 5.172281403555286, | |
| "learning_rate": 5.133928571428571e-06, | |
| "loss": 0.4834, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 2.977777777777778, | |
| "grad_norm": 6.330245277622714, | |
| "learning_rate": 5.022321428571429e-06, | |
| "loss": 0.4549, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 3.022222222222222, | |
| "grad_norm": 3.415282402532009, | |
| "learning_rate": 4.910714285714286e-06, | |
| "loss": 0.3147, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 3.066666666666667, | |
| "grad_norm": 4.166785388645218, | |
| "learning_rate": 4.799107142857143e-06, | |
| "loss": 0.1504, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 3.111111111111111, | |
| "grad_norm": 6.851997842026729, | |
| "learning_rate": 4.6875000000000004e-06, | |
| "loss": 0.141, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 3.1555555555555554, | |
| "grad_norm": 3.7642546393186667, | |
| "learning_rate": 4.5758928571428575e-06, | |
| "loss": 0.125, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "grad_norm": 5.325591448299647, | |
| "learning_rate": 4.464285714285715e-06, | |
| "loss": 0.1188, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 3.2444444444444445, | |
| "grad_norm": 5.527673533546325, | |
| "learning_rate": 4.352678571428572e-06, | |
| "loss": 0.1201, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 3.2888888888888888, | |
| "grad_norm": 3.759586790759837, | |
| "learning_rate": 4.241071428571429e-06, | |
| "loss": 0.1179, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 3.3333333333333335, | |
| "grad_norm": 4.415041949203345, | |
| "learning_rate": 4.129464285714286e-06, | |
| "loss": 0.1309, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 3.3777777777777778, | |
| "grad_norm": 4.284866459571792, | |
| "learning_rate": 4.017857142857143e-06, | |
| "loss": 0.1041, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 3.422222222222222, | |
| "grad_norm": 5.196722964569633, | |
| "learning_rate": 3.90625e-06, | |
| "loss": 0.1111, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 3.466666666666667, | |
| "grad_norm": 4.591253928929354, | |
| "learning_rate": 3.794642857142857e-06, | |
| "loss": 0.1061, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 3.511111111111111, | |
| "grad_norm": 4.5848311961810095, | |
| "learning_rate": 3.6830357142857147e-06, | |
| "loss": 0.1148, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 3.5555555555555554, | |
| "grad_norm": 4.200054025746849, | |
| "learning_rate": 3.5714285714285718e-06, | |
| "loss": 0.1226, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 3.6, | |
| "grad_norm": 5.7963405952838, | |
| "learning_rate": 3.459821428571429e-06, | |
| "loss": 0.1315, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 3.6444444444444444, | |
| "grad_norm": 3.955311966290277, | |
| "learning_rate": 3.3482142857142855e-06, | |
| "loss": 0.103, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 3.688888888888889, | |
| "grad_norm": 4.566987235058567, | |
| "learning_rate": 3.2366071428571435e-06, | |
| "loss": 0.1141, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 3.7333333333333334, | |
| "grad_norm": 4.294145248113143, | |
| "learning_rate": 3.125e-06, | |
| "loss": 0.1149, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 3.7777777777777777, | |
| "grad_norm": 4.2303933330811265, | |
| "learning_rate": 3.0133928571428572e-06, | |
| "loss": 0.1167, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 3.822222222222222, | |
| "grad_norm": 4.383212198303789, | |
| "learning_rate": 2.9017857142857148e-06, | |
| "loss": 0.1104, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 3.8666666666666667, | |
| "grad_norm": 4.659228963692107, | |
| "learning_rate": 2.790178571428572e-06, | |
| "loss": 0.126, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 3.911111111111111, | |
| "grad_norm": 4.553953752595745, | |
| "learning_rate": 2.6785714285714285e-06, | |
| "loss": 0.1018, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 3.9555555555555557, | |
| "grad_norm": 4.210679100865541, | |
| "learning_rate": 2.5669642857142856e-06, | |
| "loss": 0.1171, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 4.658526925117024, | |
| "learning_rate": 2.455357142857143e-06, | |
| "loss": 0.1114, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 4.044444444444444, | |
| "grad_norm": 2.070396358155716, | |
| "learning_rate": 2.3437500000000002e-06, | |
| "loss": 0.0308, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 4.088888888888889, | |
| "grad_norm": 1.6558418128729855, | |
| "learning_rate": 2.2321428571428573e-06, | |
| "loss": 0.0302, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 4.133333333333334, | |
| "grad_norm": 1.9493882647401612, | |
| "learning_rate": 2.1205357142857144e-06, | |
| "loss": 0.0253, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 4.177777777777778, | |
| "grad_norm": 3.1324543060876486, | |
| "learning_rate": 2.0089285714285715e-06, | |
| "loss": 0.025, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 4.222222222222222, | |
| "grad_norm": 2.4431619806408924, | |
| "learning_rate": 1.8973214285714286e-06, | |
| "loss": 0.0205, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 4.266666666666667, | |
| "grad_norm": 2.7393785953503196, | |
| "learning_rate": 1.7857142857142859e-06, | |
| "loss": 0.0226, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 4.311111111111111, | |
| "grad_norm": 2.5611198617550546, | |
| "learning_rate": 1.6741071428571428e-06, | |
| "loss": 0.0199, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 4.355555555555555, | |
| "grad_norm": 1.5062765363432846, | |
| "learning_rate": 1.5625e-06, | |
| "loss": 0.0221, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 4.4, | |
| "grad_norm": 1.2431012289310222, | |
| "learning_rate": 1.4508928571428574e-06, | |
| "loss": 0.0209, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 4.444444444444445, | |
| "grad_norm": 2.1836083399913266, | |
| "learning_rate": 1.3392857142857143e-06, | |
| "loss": 0.0229, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 4.488888888888889, | |
| "grad_norm": 1.8258818292695604, | |
| "learning_rate": 1.2276785714285716e-06, | |
| "loss": 0.0189, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 4.533333333333333, | |
| "grad_norm": 2.6358008650733162, | |
| "learning_rate": 1.1160714285714287e-06, | |
| "loss": 0.0237, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 4.5777777777777775, | |
| "grad_norm": 1.6534473635236113, | |
| "learning_rate": 1.0044642857142857e-06, | |
| "loss": 0.021, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 4.622222222222222, | |
| "grad_norm": 1.7965235570696243, | |
| "learning_rate": 8.928571428571429e-07, | |
| "loss": 0.0223, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 4.666666666666667, | |
| "grad_norm": 1.99467714545676, | |
| "learning_rate": 7.8125e-07, | |
| "loss": 0.0187, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 4.711111111111111, | |
| "grad_norm": 2.383918007165847, | |
| "learning_rate": 6.696428571428571e-07, | |
| "loss": 0.0208, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 4.7555555555555555, | |
| "grad_norm": 2.9389871985963367, | |
| "learning_rate": 5.580357142857143e-07, | |
| "loss": 0.019, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 4.8, | |
| "grad_norm": 3.156468380456551, | |
| "learning_rate": 4.4642857142857147e-07, | |
| "loss": 0.0209, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 4.844444444444444, | |
| "grad_norm": 1.2846734469252965, | |
| "learning_rate": 3.3482142857142856e-07, | |
| "loss": 0.0199, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 4.888888888888889, | |
| "grad_norm": 1.436802701322044, | |
| "learning_rate": 2.2321428571428574e-07, | |
| "loss": 0.0213, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 4.933333333333334, | |
| "grad_norm": 3.39880105752545, | |
| "learning_rate": 1.1160714285714287e-07, | |
| "loss": 0.0255, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 4.977777777777778, | |
| "grad_norm": 2.137870213194104, | |
| "learning_rate": 0.0, | |
| "loss": 0.0204, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 4.977777777777778, | |
| "step": 560, | |
| "total_flos": 2469603901440.0, | |
| "train_loss": 0.6376679654233157, | |
| "train_runtime": 7355.0643, | |
| "train_samples_per_second": 2.447, | |
| "train_steps_per_second": 0.076 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 560, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": false, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2469603901440.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |