{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 9.96, "eval_steps": 500, "global_step": 1563, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0448, "grad_norm": 0.6615037322044373, "learning_rate": 0.00019987163029525033, "loss": 0.8977, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 7, "tokens_per_second_per_gpu": 2592.29 }, { "epoch": 0.0896, "grad_norm": 0.24950779974460602, "learning_rate": 0.00019897304236200257, "loss": 0.5247, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 14, "tokens_per_second_per_gpu": 2595.43 }, { "epoch": 0.1344, "grad_norm": 0.23673032224178314, "learning_rate": 0.00019807445442875484, "loss": 0.4909, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 21, "tokens_per_second_per_gpu": 2570.26 }, { "epoch": 0.1792, "grad_norm": 0.30793243646621704, "learning_rate": 0.00019717586649550708, "loss": 0.4704, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 28, "tokens_per_second_per_gpu": 2572.23 }, { "epoch": 0.224, "grad_norm": 0.2643095552921295, "learning_rate": 0.00019627727856225932, "loss": 0.4953, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 35, "tokens_per_second_per_gpu": 2555.05 }, { "epoch": 0.2688, "grad_norm": 0.22326567769050598, "learning_rate": 0.00019537869062901157, "loss": 0.4605, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 42, "tokens_per_second_per_gpu": 2517.02 }, { "epoch": 0.3136, "grad_norm": 0.24139587581157684, "learning_rate": 0.0001944801026957638, "loss": 0.4491, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 49, "tokens_per_second_per_gpu": 2542.62 }, { "epoch": 0.3584, "grad_norm": 0.22835120558738708, "learning_rate": 0.00019358151476251605, "loss": 0.4365, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 56, "tokens_per_second_per_gpu": 2522.46 }, { "epoch": 0.4032, "grad_norm": 0.31657782196998596, "learning_rate": 0.0001926829268292683, "loss": 0.4421, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 63, "tokens_per_second_per_gpu": 2559.09 }, { "epoch": 0.448, "grad_norm": 0.3069691061973572, "learning_rate": 0.00019178433889602053, "loss": 0.4296, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 70, "tokens_per_second_per_gpu": 2559.25 }, { "epoch": 0.4928, "grad_norm": 0.24737463891506195, "learning_rate": 0.0001908857509627728, "loss": 0.4337, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 77, "tokens_per_second_per_gpu": 2548.39 }, { "epoch": 0.5376, "grad_norm": 0.21972940862178802, "learning_rate": 0.00018998716302952504, "loss": 0.4109, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 84, "tokens_per_second_per_gpu": 2540.14 }, { "epoch": 0.5824, "grad_norm": 0.2903401851654053, "learning_rate": 0.00018908857509627728, "loss": 0.435, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 91, "tokens_per_second_per_gpu": 2551.73 }, { "epoch": 0.6272, "grad_norm": 0.24281612038612366, "learning_rate": 0.00018818998716302953, "loss": 0.421, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 98, "tokens_per_second_per_gpu": 2552.13 }, { "epoch": 0.672, "grad_norm": 0.22035975754261017, "learning_rate": 0.0001872913992297818, "loss": 0.4311, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 105, "tokens_per_second_per_gpu": 2545.85 }, { "epoch": 0.7168, "grad_norm": 0.3112412095069885, "learning_rate": 0.00018639281129653404, "loss": 0.4382, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 112, "tokens_per_second_per_gpu": 2545.28 }, { "epoch": 0.7616, "grad_norm": 0.281745582818985, "learning_rate": 0.00018549422336328628, "loss": 0.4292, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 119, "tokens_per_second_per_gpu": 2557.38 }, { "epoch": 0.8064, "grad_norm": 0.3101930022239685, "learning_rate": 0.00018459563543003852, "loss": 0.4092, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 126, "tokens_per_second_per_gpu": 2561.1 }, { "epoch": 0.8512, "grad_norm": 0.27449899911880493, "learning_rate": 0.0001836970474967908, "loss": 0.4163, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 133, "tokens_per_second_per_gpu": 2552.95 }, { "epoch": 0.896, "grad_norm": 0.275149405002594, "learning_rate": 0.00018279845956354303, "loss": 0.4373, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 140, "tokens_per_second_per_gpu": 2541.22 }, { "epoch": 0.9408, "grad_norm": 0.2836650013923645, "learning_rate": 0.00018189987163029524, "loss": 0.4055, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 147, "tokens_per_second_per_gpu": 2524.47 }, { "epoch": 0.9856, "grad_norm": 0.23854206502437592, "learning_rate": 0.00018100128369704749, "loss": 0.3914, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 154, "tokens_per_second_per_gpu": 2562.91 }, { "epoch": 1.0256, "grad_norm": 0.25312554836273193, "learning_rate": 0.00018010269576379976, "loss": 0.4056, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 161, "tokens_per_second_per_gpu": 2280.67 }, { "epoch": 1.0704, "grad_norm": 0.3411436676979065, "learning_rate": 0.000179204107830552, "loss": 0.4032, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 168, "tokens_per_second_per_gpu": 2514.49 }, { "epoch": 1.1152, "grad_norm": 0.31019991636276245, "learning_rate": 0.00017830551989730424, "loss": 0.3777, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 175, "tokens_per_second_per_gpu": 2576.04 }, { "epoch": 1.16, "grad_norm": 0.33959564566612244, "learning_rate": 0.00017740693196405648, "loss": 0.3781, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 182, "tokens_per_second_per_gpu": 2550.58 }, { "epoch": 1.2048, "grad_norm": 0.3206247389316559, "learning_rate": 0.00017650834403080875, "loss": 0.3888, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 189, "tokens_per_second_per_gpu": 2555.84 }, { "epoch": 1.2496, "grad_norm": 0.2893042266368866, "learning_rate": 0.000175609756097561, "loss": 0.3905, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 196, "tokens_per_second_per_gpu": 2554.01 }, { "epoch": 1.2944, "grad_norm": 0.30622121691703796, "learning_rate": 0.00017471116816431323, "loss": 0.3998, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 203, "tokens_per_second_per_gpu": 2563.48 }, { "epoch": 1.3392, "grad_norm": 0.3980805575847626, "learning_rate": 0.00017381258023106547, "loss": 0.391, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 210, "tokens_per_second_per_gpu": 2534.96 }, { "epoch": 1.384, "grad_norm": 0.3588064908981323, "learning_rate": 0.00017291399229781772, "loss": 0.3742, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 217, "tokens_per_second_per_gpu": 2560.36 }, { "epoch": 1.4288, "grad_norm": 0.5023795366287231, "learning_rate": 0.00017201540436456998, "loss": 0.3862, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 224, "tokens_per_second_per_gpu": 2578.01 }, { "epoch": 1.4736, "grad_norm": 0.3610862195491791, "learning_rate": 0.00017111681643132223, "loss": 0.3788, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 231, "tokens_per_second_per_gpu": 2527.8 }, { "epoch": 1.5184, "grad_norm": 0.3025442957878113, "learning_rate": 0.00017021822849807447, "loss": 0.3772, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 238, "tokens_per_second_per_gpu": 2520.21 }, { "epoch": 1.5632000000000001, "grad_norm": 0.27140572667121887, "learning_rate": 0.0001693196405648267, "loss": 0.3645, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 245, "tokens_per_second_per_gpu": 2570.72 }, { "epoch": 1.608, "grad_norm": 0.28411632776260376, "learning_rate": 0.00016842105263157895, "loss": 0.3857, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 252, "tokens_per_second_per_gpu": 2542.71 }, { "epoch": 1.6528, "grad_norm": 0.3377877473831177, "learning_rate": 0.0001675224646983312, "loss": 0.392, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 259, "tokens_per_second_per_gpu": 2530.43 }, { "epoch": 1.6976, "grad_norm": 0.30815646052360535, "learning_rate": 0.00016662387676508343, "loss": 0.3862, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 266, "tokens_per_second_per_gpu": 2558.4 }, { "epoch": 1.7424, "grad_norm": 0.3454481065273285, "learning_rate": 0.0001657252888318357, "loss": 0.3827, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 273, "tokens_per_second_per_gpu": 2563.37 }, { "epoch": 1.7872, "grad_norm": 0.30998337268829346, "learning_rate": 0.00016482670089858794, "loss": 0.3735, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 280, "tokens_per_second_per_gpu": 2513.39 }, { "epoch": 1.8319999999999999, "grad_norm": 0.30814942717552185, "learning_rate": 0.0001639281129653402, "loss": 0.3702, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 287, "tokens_per_second_per_gpu": 2562.1 }, { "epoch": 1.8768, "grad_norm": 0.39205342531204224, "learning_rate": 0.00016302952503209243, "loss": 0.3585, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 294, "tokens_per_second_per_gpu": 2530.89 }, { "epoch": 1.9216, "grad_norm": 0.3053893446922302, "learning_rate": 0.00016213093709884467, "loss": 0.3831, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 301, "tokens_per_second_per_gpu": 2569.93 }, { "epoch": 1.9664000000000001, "grad_norm": 0.31996455788612366, "learning_rate": 0.00016123234916559694, "loss": 0.3691, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 308, "tokens_per_second_per_gpu": 2552.79 }, { "epoch": 2.0064, "grad_norm": 0.3335410952568054, "learning_rate": 0.00016033376123234918, "loss": 0.3779, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 315, "tokens_per_second_per_gpu": 2257.59 }, { "epoch": 2.0512, "grad_norm": 0.4308569133281708, "learning_rate": 0.00015943517329910142, "loss": 0.3415, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 322, "tokens_per_second_per_gpu": 2547.15 }, { "epoch": 2.096, "grad_norm": 0.3975169360637665, "learning_rate": 0.00015853658536585366, "loss": 0.3491, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 329, "tokens_per_second_per_gpu": 2522.76 }, { "epoch": 2.1408, "grad_norm": 0.40929752588272095, "learning_rate": 0.00015763799743260593, "loss": 0.3134, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 336, "tokens_per_second_per_gpu": 2544.97 }, { "epoch": 2.1856, "grad_norm": 0.4193170964717865, "learning_rate": 0.00015673940949935817, "loss": 0.3166, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 343, "tokens_per_second_per_gpu": 2568.46 }, { "epoch": 2.2304, "grad_norm": 0.48210176825523376, "learning_rate": 0.0001558408215661104, "loss": 0.3217, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 350, "tokens_per_second_per_gpu": 2536.96 }, { "epoch": 2.2752, "grad_norm": 0.49080193042755127, "learning_rate": 0.00015494223363286263, "loss": 0.306, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 357, "tokens_per_second_per_gpu": 2578.61 }, { "epoch": 2.32, "grad_norm": 0.49059173464775085, "learning_rate": 0.0001540436456996149, "loss": 0.3271, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 364, "tokens_per_second_per_gpu": 2553.75 }, { "epoch": 2.3648, "grad_norm": 0.43335044384002686, "learning_rate": 0.00015314505776636714, "loss": 0.3217, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 371, "tokens_per_second_per_gpu": 2545.92 }, { "epoch": 2.4096, "grad_norm": 0.44906413555145264, "learning_rate": 0.00015224646983311938, "loss": 0.3185, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 378, "tokens_per_second_per_gpu": 2549.23 }, { "epoch": 2.4544, "grad_norm": 0.4638211727142334, "learning_rate": 0.00015134788189987162, "loss": 0.322, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 385, "tokens_per_second_per_gpu": 2535.31 }, { "epoch": 2.4992, "grad_norm": 0.46060240268707275, "learning_rate": 0.0001504492939666239, "loss": 0.3148, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 392, "tokens_per_second_per_gpu": 2553.77 }, { "epoch": 2.544, "grad_norm": 0.4542367160320282, "learning_rate": 0.00014955070603337613, "loss": 0.3079, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 399, "tokens_per_second_per_gpu": 2545.7 }, { "epoch": 2.5888, "grad_norm": 0.42871296405792236, "learning_rate": 0.00014865211810012838, "loss": 0.3135, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 406, "tokens_per_second_per_gpu": 2557.89 }, { "epoch": 2.6336, "grad_norm": 0.4550248086452484, "learning_rate": 0.00014775353016688062, "loss": 0.3198, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 413, "tokens_per_second_per_gpu": 2565.86 }, { "epoch": 2.6784, "grad_norm": 0.47693103551864624, "learning_rate": 0.0001468549422336329, "loss": 0.3427, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 420, "tokens_per_second_per_gpu": 2536.18 }, { "epoch": 2.7232, "grad_norm": 0.5266048312187195, "learning_rate": 0.00014595635430038513, "loss": 0.3204, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 427, "tokens_per_second_per_gpu": 2565.95 }, { "epoch": 2.768, "grad_norm": 0.4391225576400757, "learning_rate": 0.00014505776636713737, "loss": 0.3241, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 434, "tokens_per_second_per_gpu": 2509.53 }, { "epoch": 2.8128, "grad_norm": 0.5922185778617859, "learning_rate": 0.0001441591784338896, "loss": 0.3104, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 441, "tokens_per_second_per_gpu": 2529.18 }, { "epoch": 2.8576, "grad_norm": 0.5623916983604431, "learning_rate": 0.00014326059050064185, "loss": 0.3122, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 448, "tokens_per_second_per_gpu": 2563.27 }, { "epoch": 2.9024, "grad_norm": 0.49947455525398254, "learning_rate": 0.0001423620025673941, "loss": 0.3282, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 455, "tokens_per_second_per_gpu": 2558.53 }, { "epoch": 2.9472, "grad_norm": 0.4506182372570038, "learning_rate": 0.00014146341463414634, "loss": 0.3247, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 462, "tokens_per_second_per_gpu": 2540.83 }, { "epoch": 2.992, "grad_norm": 0.42775827646255493, "learning_rate": 0.00014056482670089858, "loss": 0.2999, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 469, "tokens_per_second_per_gpu": 2576.4 }, { "epoch": 3.032, "grad_norm": 0.5030285120010376, "learning_rate": 0.00013966623876765085, "loss": 0.2621, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 476, "tokens_per_second_per_gpu": 2292.5 }, { "epoch": 3.0768, "grad_norm": 0.5882396697998047, "learning_rate": 0.0001387676508344031, "loss": 0.25, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 483, "tokens_per_second_per_gpu": 2549.05 }, { "epoch": 3.1216, "grad_norm": 0.564044177532196, "learning_rate": 0.00013786906290115533, "loss": 0.2308, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 490, "tokens_per_second_per_gpu": 2543.07 }, { "epoch": 3.1664, "grad_norm": 0.6049212217330933, "learning_rate": 0.00013697047496790757, "loss": 0.2476, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 497, "tokens_per_second_per_gpu": 2513.19 }, { "epoch": 3.2112, "grad_norm": 0.6576336622238159, "learning_rate": 0.00013607188703465984, "loss": 0.2309, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 504, "tokens_per_second_per_gpu": 2543.95 }, { "epoch": 3.2560000000000002, "grad_norm": 0.5907571315765381, "learning_rate": 0.00013517329910141208, "loss": 0.2321, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 511, "tokens_per_second_per_gpu": 2551.27 }, { "epoch": 3.3008, "grad_norm": 0.5795454978942871, "learning_rate": 0.00013427471116816432, "loss": 0.2237, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 518, "tokens_per_second_per_gpu": 2544.01 }, { "epoch": 3.3456, "grad_norm": 0.6234800219535828, "learning_rate": 0.00013337612323491657, "loss": 0.2414, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 525, "tokens_per_second_per_gpu": 2581.08 }, { "epoch": 3.3904, "grad_norm": 0.6472600698471069, "learning_rate": 0.0001324775353016688, "loss": 0.2452, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 532, "tokens_per_second_per_gpu": 2544.41 }, { "epoch": 3.4352, "grad_norm": 0.5967562794685364, "learning_rate": 0.00013157894736842108, "loss": 0.248, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 539, "tokens_per_second_per_gpu": 2560.68 }, { "epoch": 3.48, "grad_norm": 0.7238852977752686, "learning_rate": 0.00013068035943517332, "loss": 0.2374, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 546, "tokens_per_second_per_gpu": 2550.16 }, { "epoch": 3.5248, "grad_norm": 0.6091212034225464, "learning_rate": 0.00012978177150192553, "loss": 0.2438, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 553, "tokens_per_second_per_gpu": 2573.33 }, { "epoch": 3.5696, "grad_norm": 0.6311376094818115, "learning_rate": 0.0001288831835686778, "loss": 0.2356, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 560, "tokens_per_second_per_gpu": 2546.24 }, { "epoch": 3.6144, "grad_norm": 0.6457822918891907, "learning_rate": 0.00012798459563543004, "loss": 0.2413, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 567, "tokens_per_second_per_gpu": 2522.36 }, { "epoch": 3.6592000000000002, "grad_norm": 0.6935224533081055, "learning_rate": 0.00012708600770218229, "loss": 0.2408, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 574, "tokens_per_second_per_gpu": 2549.34 }, { "epoch": 3.7039999999999997, "grad_norm": 0.5993574857711792, "learning_rate": 0.00012618741976893453, "loss": 0.2519, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 581, "tokens_per_second_per_gpu": 2561.85 }, { "epoch": 3.7488, "grad_norm": 0.6998376846313477, "learning_rate": 0.00012528883183568677, "loss": 0.2325, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 588, "tokens_per_second_per_gpu": 2551.37 }, { "epoch": 3.7936, "grad_norm": 0.6131768822669983, "learning_rate": 0.00012439024390243904, "loss": 0.2456, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 595, "tokens_per_second_per_gpu": 2552.73 }, { "epoch": 3.8384, "grad_norm": 0.5565290451049805, "learning_rate": 0.00012349165596919128, "loss": 0.2322, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 602, "tokens_per_second_per_gpu": 2575.27 }, { "epoch": 3.8832, "grad_norm": 0.5972646474838257, "learning_rate": 0.00012259306803594352, "loss": 0.2517, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 609, "tokens_per_second_per_gpu": 2562.61 }, { "epoch": 3.928, "grad_norm": 0.6241262555122375, "learning_rate": 0.00012169448010269578, "loss": 0.2495, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 616, "tokens_per_second_per_gpu": 2503.33 }, { "epoch": 3.9728, "grad_norm": 0.6228710412979126, "learning_rate": 0.00012079589216944802, "loss": 0.2449, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 623, "tokens_per_second_per_gpu": 2578.7 }, { "epoch": 4.0128, "grad_norm": 0.5616887211799622, "learning_rate": 0.00011989730423620027, "loss": 0.2169, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 630, "tokens_per_second_per_gpu": 2279.14 }, { "epoch": 4.0576, "grad_norm": 0.8560980558395386, "learning_rate": 0.00011899871630295251, "loss": 0.1538, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 637, "tokens_per_second_per_gpu": 2556.17 }, { "epoch": 4.1024, "grad_norm": 0.6041189432144165, "learning_rate": 0.00011810012836970477, "loss": 0.1545, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 644, "tokens_per_second_per_gpu": 2563.05 }, { "epoch": 4.1472, "grad_norm": 0.7379241585731506, "learning_rate": 0.000117201540436457, "loss": 0.1551, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 651, "tokens_per_second_per_gpu": 2535.22 }, { "epoch": 4.192, "grad_norm": 0.7570924758911133, "learning_rate": 0.00011630295250320924, "loss": 0.1509, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 658, "tokens_per_second_per_gpu": 2540.17 }, { "epoch": 4.2368, "grad_norm": 0.8700867891311646, "learning_rate": 0.00011540436456996148, "loss": 0.1427, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 665, "tokens_per_second_per_gpu": 2554.04 }, { "epoch": 4.2816, "grad_norm": 0.7508013844490051, "learning_rate": 0.00011450577663671374, "loss": 0.1532, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 672, "tokens_per_second_per_gpu": 2553.99 }, { "epoch": 4.3264, "grad_norm": 0.7163228392601013, "learning_rate": 0.00011360718870346598, "loss": 0.1539, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 679, "tokens_per_second_per_gpu": 2553.8 }, { "epoch": 4.3712, "grad_norm": 0.7482940554618835, "learning_rate": 0.00011270860077021823, "loss": 0.1613, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 686, "tokens_per_second_per_gpu": 2546.84 }, { "epoch": 4.416, "grad_norm": 0.8410982489585876, "learning_rate": 0.00011181001283697047, "loss": 0.1526, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 693, "tokens_per_second_per_gpu": 2535.88 }, { "epoch": 4.4608, "grad_norm": 0.8021649122238159, "learning_rate": 0.00011091142490372273, "loss": 0.1597, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 700, "tokens_per_second_per_gpu": 2565.13 }, { "epoch": 4.5056, "grad_norm": 0.7209800481796265, "learning_rate": 0.00011001283697047497, "loss": 0.1673, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 707, "tokens_per_second_per_gpu": 2541.84 }, { "epoch": 4.5504, "grad_norm": 0.8158049583435059, "learning_rate": 0.00010911424903722723, "loss": 0.1743, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 714, "tokens_per_second_per_gpu": 2524.95 }, { "epoch": 4.5952, "grad_norm": 0.7977858185768127, "learning_rate": 0.00010821566110397947, "loss": 0.1645, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 721, "tokens_per_second_per_gpu": 2557.49 }, { "epoch": 4.64, "grad_norm": 0.9170898199081421, "learning_rate": 0.00010731707317073172, "loss": 0.1562, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 728, "tokens_per_second_per_gpu": 2558.81 }, { "epoch": 4.6848, "grad_norm": 0.7245842814445496, "learning_rate": 0.00010641848523748397, "loss": 0.1539, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 735, "tokens_per_second_per_gpu": 2591.05 }, { "epoch": 4.7296, "grad_norm": 0.7396605014801025, "learning_rate": 0.00010551989730423622, "loss": 0.1659, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 742, "tokens_per_second_per_gpu": 2524.23 }, { "epoch": 4.7744, "grad_norm": 0.8805770874023438, "learning_rate": 0.00010462130937098844, "loss": 0.1609, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 749, "tokens_per_second_per_gpu": 2550.75 }, { "epoch": 4.8192, "grad_norm": 0.9227202534675598, "learning_rate": 0.00010372272143774069, "loss": 0.1624, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 756, "tokens_per_second_per_gpu": 2560.56 }, { "epoch": 4.864, "grad_norm": 0.8405540585517883, "learning_rate": 0.00010282413350449293, "loss": 0.174, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 763, "tokens_per_second_per_gpu": 2552.43 }, { "epoch": 4.9088, "grad_norm": 0.9182495474815369, "learning_rate": 0.00010192554557124519, "loss": 0.1616, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 770, "tokens_per_second_per_gpu": 2565.39 }, { "epoch": 4.9536, "grad_norm": 0.8225792646408081, "learning_rate": 0.00010102695763799743, "loss": 0.1531, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 777, "tokens_per_second_per_gpu": 2539.22 }, { "epoch": 4.9984, "grad_norm": 0.8054759502410889, "learning_rate": 0.00010012836970474968, "loss": 0.1626, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 784, "tokens_per_second_per_gpu": 2546.63 }, { "epoch": 5.0384, "grad_norm": 0.7499905228614807, "learning_rate": 9.922978177150193e-05, "loss": 0.0926, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 791, "tokens_per_second_per_gpu": 2266.09 }, { "epoch": 5.0832, "grad_norm": 0.7659611701965332, "learning_rate": 9.833119383825418e-05, "loss": 0.0901, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 798, "tokens_per_second_per_gpu": 2526.01 }, { "epoch": 5.128, "grad_norm": 0.7457947134971619, "learning_rate": 9.743260590500642e-05, "loss": 0.0969, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 805, "tokens_per_second_per_gpu": 2548.79 }, { "epoch": 5.1728, "grad_norm": 0.671251654624939, "learning_rate": 9.653401797175868e-05, "loss": 0.0868, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 812, "tokens_per_second_per_gpu": 2576.43 }, { "epoch": 5.2176, "grad_norm": 0.6863052248954773, "learning_rate": 9.563543003851092e-05, "loss": 0.0925, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 819, "tokens_per_second_per_gpu": 2552.44 }, { "epoch": 5.2624, "grad_norm": 0.7440989017486572, "learning_rate": 9.473684210526316e-05, "loss": 0.0869, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 826, "tokens_per_second_per_gpu": 2553.45 }, { "epoch": 5.3072, "grad_norm": 0.8614964485168457, "learning_rate": 9.38382541720154e-05, "loss": 0.0955, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 833, "tokens_per_second_per_gpu": 2562.69 }, { "epoch": 5.352, "grad_norm": 0.9427368640899658, "learning_rate": 9.293966623876766e-05, "loss": 0.098, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 840, "tokens_per_second_per_gpu": 2548.07 }, { "epoch": 5.3968, "grad_norm": 0.936646044254303, "learning_rate": 9.20410783055199e-05, "loss": 0.1011, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 847, "tokens_per_second_per_gpu": 2542.11 }, { "epoch": 5.4416, "grad_norm": 0.7434241771697998, "learning_rate": 9.114249037227216e-05, "loss": 0.1016, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 854, "tokens_per_second_per_gpu": 2542.61 }, { "epoch": 5.4864, "grad_norm": 0.8048710823059082, "learning_rate": 9.02439024390244e-05, "loss": 0.0926, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 861, "tokens_per_second_per_gpu": 2533.62 }, { "epoch": 5.5312, "grad_norm": 0.9675536751747131, "learning_rate": 8.934531450577664e-05, "loss": 0.0955, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 868, "tokens_per_second_per_gpu": 2548.0 }, { "epoch": 5.576, "grad_norm": 0.7267205715179443, "learning_rate": 8.84467265725289e-05, "loss": 0.1062, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 875, "tokens_per_second_per_gpu": 2540.13 }, { "epoch": 5.6208, "grad_norm": 0.7611416578292847, "learning_rate": 8.754813863928114e-05, "loss": 0.1031, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 882, "tokens_per_second_per_gpu": 2547.44 }, { "epoch": 5.6655999999999995, "grad_norm": 0.781144917011261, "learning_rate": 8.664955070603338e-05, "loss": 0.1007, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 889, "tokens_per_second_per_gpu": 2549.36 }, { "epoch": 5.7104, "grad_norm": 0.9141478538513184, "learning_rate": 8.575096277278562e-05, "loss": 0.0955, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 896, "tokens_per_second_per_gpu": 2551.51 }, { "epoch": 5.7552, "grad_norm": 0.8914825320243835, "learning_rate": 8.485237483953787e-05, "loss": 0.0874, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 903, "tokens_per_second_per_gpu": 2553.0 }, { "epoch": 5.8, "grad_norm": 0.8610872626304626, "learning_rate": 8.395378690629012e-05, "loss": 0.0967, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 910, "tokens_per_second_per_gpu": 2563.66 }, { "epoch": 5.8448, "grad_norm": 0.7397611737251282, "learning_rate": 8.305519897304237e-05, "loss": 0.1016, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 917, "tokens_per_second_per_gpu": 2525.33 }, { "epoch": 5.8896, "grad_norm": 0.8295127153396606, "learning_rate": 8.215661103979461e-05, "loss": 0.0931, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 924, "tokens_per_second_per_gpu": 2560.32 }, { "epoch": 5.9344, "grad_norm": 0.8457576632499695, "learning_rate": 8.125802310654685e-05, "loss": 0.1033, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 931, "tokens_per_second_per_gpu": 2573.24 }, { "epoch": 5.9792, "grad_norm": 0.6747003793716431, "learning_rate": 8.03594351732991e-05, "loss": 0.0957, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 938, "tokens_per_second_per_gpu": 2533.53 }, { "epoch": 6.0192, "grad_norm": 0.4863660931587219, "learning_rate": 7.946084724005135e-05, "loss": 0.08, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 945, "tokens_per_second_per_gpu": 2300.32 }, { "epoch": 6.064, "grad_norm": 0.7925319075584412, "learning_rate": 7.85622593068036e-05, "loss": 0.0528, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 952, "tokens_per_second_per_gpu": 2545.45 }, { "epoch": 6.1088, "grad_norm": 0.7199363112449646, "learning_rate": 7.766367137355585e-05, "loss": 0.0546, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 959, "tokens_per_second_per_gpu": 2562.73 }, { "epoch": 6.1536, "grad_norm": 0.8268409967422485, "learning_rate": 7.676508344030809e-05, "loss": 0.0527, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 966, "tokens_per_second_per_gpu": 2550.58 }, { "epoch": 6.1984, "grad_norm": 0.7279261350631714, "learning_rate": 7.586649550706035e-05, "loss": 0.0478, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 973, "tokens_per_second_per_gpu": 2552.26 }, { "epoch": 6.2432, "grad_norm": 0.808081865310669, "learning_rate": 7.496790757381257e-05, "loss": 0.0575, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 980, "tokens_per_second_per_gpu": 2551.31 }, { "epoch": 6.288, "grad_norm": 0.6677886843681335, "learning_rate": 7.406931964056483e-05, "loss": 0.0538, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 987, "tokens_per_second_per_gpu": 2544.12 }, { "epoch": 6.3328, "grad_norm": 0.6135331392288208, "learning_rate": 7.317073170731707e-05, "loss": 0.056, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 994, "tokens_per_second_per_gpu": 2534.9 }, { "epoch": 6.3776, "grad_norm": 0.7025336027145386, "learning_rate": 7.227214377406933e-05, "loss": 0.0578, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1001, "tokens_per_second_per_gpu": 2529.9 }, { "epoch": 6.4224, "grad_norm": 0.596121072769165, "learning_rate": 7.137355584082157e-05, "loss": 0.0548, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1008, "tokens_per_second_per_gpu": 2556.17 }, { "epoch": 6.4672, "grad_norm": 0.7072296142578125, "learning_rate": 7.047496790757382e-05, "loss": 0.0573, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1015, "tokens_per_second_per_gpu": 2543.85 }, { "epoch": 6.5120000000000005, "grad_norm": 0.7128002643585205, "learning_rate": 6.957637997432606e-05, "loss": 0.0519, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1022, "tokens_per_second_per_gpu": 2566.17 }, { "epoch": 6.5568, "grad_norm": 0.7213091850280762, "learning_rate": 6.86777920410783e-05, "loss": 0.0575, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1029, "tokens_per_second_per_gpu": 2558.61 }, { "epoch": 6.6016, "grad_norm": 0.7774225473403931, "learning_rate": 6.777920410783055e-05, "loss": 0.0565, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1036, "tokens_per_second_per_gpu": 2568.69 }, { "epoch": 6.6464, "grad_norm": 0.6724172234535217, "learning_rate": 6.68806161745828e-05, "loss": 0.057, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1043, "tokens_per_second_per_gpu": 2553.57 }, { "epoch": 6.6912, "grad_norm": 0.6828070282936096, "learning_rate": 6.598202824133504e-05, "loss": 0.0515, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1050, "tokens_per_second_per_gpu": 2542.67 }, { "epoch": 6.736, "grad_norm": 0.6932708621025085, "learning_rate": 6.50834403080873e-05, "loss": 0.0615, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1057, "tokens_per_second_per_gpu": 2542.55 }, { "epoch": 6.7808, "grad_norm": 0.7474423050880432, "learning_rate": 6.418485237483954e-05, "loss": 0.0585, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1064, "tokens_per_second_per_gpu": 2523.75 }, { "epoch": 6.8256, "grad_norm": 0.7136902213096619, "learning_rate": 6.32862644415918e-05, "loss": 0.0576, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1071, "tokens_per_second_per_gpu": 2564.22 }, { "epoch": 6.8704, "grad_norm": 0.8908403515815735, "learning_rate": 6.238767650834402e-05, "loss": 0.0613, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1078, "tokens_per_second_per_gpu": 2512.08 }, { "epoch": 6.9152000000000005, "grad_norm": 0.744345486164093, "learning_rate": 6.148908857509628e-05, "loss": 0.0589, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1085, "tokens_per_second_per_gpu": 2577.36 }, { "epoch": 6.96, "grad_norm": 0.8013383150100708, "learning_rate": 6.059050064184852e-05, "loss": 0.0575, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1092, "tokens_per_second_per_gpu": 2552.63 }, { "epoch": 7.0, "grad_norm": 1.1780116558074951, "learning_rate": 5.969191270860077e-05, "loss": 0.059, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1099, "tokens_per_second_per_gpu": 8733.1 }, { "epoch": 7.0448, "grad_norm": 0.5212501883506775, "learning_rate": 5.879332477535302e-05, "loss": 0.0312, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1106, "tokens_per_second_per_gpu": 2550.65 }, { "epoch": 7.0896, "grad_norm": 0.6150863766670227, "learning_rate": 5.789473684210527e-05, "loss": 0.0329, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1113, "tokens_per_second_per_gpu": 2559.65 }, { "epoch": 7.1344, "grad_norm": 0.6554487943649292, "learning_rate": 5.6996148908857515e-05, "loss": 0.0331, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1120, "tokens_per_second_per_gpu": 2554.81 }, { "epoch": 7.1792, "grad_norm": 0.539727509021759, "learning_rate": 5.6097560975609764e-05, "loss": 0.0319, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1127, "tokens_per_second_per_gpu": 2540.64 }, { "epoch": 7.224, "grad_norm": 0.5467194318771362, "learning_rate": 5.5198973042362e-05, "loss": 0.0281, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1134, "tokens_per_second_per_gpu": 2564.31 }, { "epoch": 7.2688, "grad_norm": 0.7032052874565125, "learning_rate": 5.430038510911425e-05, "loss": 0.0332, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1141, "tokens_per_second_per_gpu": 2532.96 }, { "epoch": 7.3136, "grad_norm": 0.5758365392684937, "learning_rate": 5.3401797175866496e-05, "loss": 0.0339, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1148, "tokens_per_second_per_gpu": 2534.32 }, { "epoch": 7.3584, "grad_norm": 0.555092990398407, "learning_rate": 5.2503209242618744e-05, "loss": 0.0315, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1155, "tokens_per_second_per_gpu": 2554.71 }, { "epoch": 7.4032, "grad_norm": 0.5572156310081482, "learning_rate": 5.160462130937099e-05, "loss": 0.0313, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1162, "tokens_per_second_per_gpu": 2563.31 }, { "epoch": 7.448, "grad_norm": 0.6063708662986755, "learning_rate": 5.070603337612324e-05, "loss": 0.033, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1169, "tokens_per_second_per_gpu": 2561.6 }, { "epoch": 7.4928, "grad_norm": 0.5049571394920349, "learning_rate": 4.980744544287548e-05, "loss": 0.0318, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1176, "tokens_per_second_per_gpu": 2534.93 }, { "epoch": 7.5376, "grad_norm": 0.6519291400909424, "learning_rate": 4.890885750962773e-05, "loss": 0.0328, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1183, "tokens_per_second_per_gpu": 2517.86 }, { "epoch": 7.5824, "grad_norm": 0.4828682243824005, "learning_rate": 4.801026957637998e-05, "loss": 0.0308, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1190, "tokens_per_second_per_gpu": 2534.7 }, { "epoch": 7.6272, "grad_norm": 0.6436623334884644, "learning_rate": 4.711168164313222e-05, "loss": 0.0357, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1197, "tokens_per_second_per_gpu": 2545.39 }, { "epoch": 7.672, "grad_norm": 0.5696656703948975, "learning_rate": 4.621309370988447e-05, "loss": 0.0332, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1204, "tokens_per_second_per_gpu": 2571.5 }, { "epoch": 7.7168, "grad_norm": 0.5414486527442932, "learning_rate": 4.531450577663672e-05, "loss": 0.0402, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1211, "tokens_per_second_per_gpu": 2559.4 }, { "epoch": 7.7616, "grad_norm": 0.4999183118343353, "learning_rate": 4.441591784338896e-05, "loss": 0.0387, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1218, "tokens_per_second_per_gpu": 2548.04 }, { "epoch": 7.8064, "grad_norm": 0.48149573802948, "learning_rate": 4.351732991014121e-05, "loss": 0.0326, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1225, "tokens_per_second_per_gpu": 2537.79 }, { "epoch": 7.8512, "grad_norm": 0.5408401489257812, "learning_rate": 4.261874197689346e-05, "loss": 0.0338, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1232, "tokens_per_second_per_gpu": 2537.95 }, { "epoch": 7.896, "grad_norm": 0.5644415020942688, "learning_rate": 4.1720154043645705e-05, "loss": 0.0379, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1239, "tokens_per_second_per_gpu": 2542.75 }, { "epoch": 7.9408, "grad_norm": 0.5069785714149475, "learning_rate": 4.082156611039795e-05, "loss": 0.0363, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1246, "tokens_per_second_per_gpu": 2569.39 }, { "epoch": 7.9856, "grad_norm": 0.6117646098136902, "learning_rate": 3.9922978177150195e-05, "loss": 0.0381, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1253, "tokens_per_second_per_gpu": 2541.47 }, { "epoch": 8.0256, "grad_norm": 0.341382771730423, "learning_rate": 3.9024390243902444e-05, "loss": 0.0262, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1260, "tokens_per_second_per_gpu": 2283.99 }, { "epoch": 8.0704, "grad_norm": 0.37654903531074524, "learning_rate": 3.8125802310654686e-05, "loss": 0.0213, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1267, "tokens_per_second_per_gpu": 2548.25 }, { "epoch": 8.1152, "grad_norm": 0.34505343437194824, "learning_rate": 3.7227214377406934e-05, "loss": 0.0195, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1274, "tokens_per_second_per_gpu": 2524.2 }, { "epoch": 8.16, "grad_norm": 0.557098388671875, "learning_rate": 3.632862644415918e-05, "loss": 0.0236, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1281, "tokens_per_second_per_gpu": 2553.97 }, { "epoch": 8.2048, "grad_norm": 0.32856565713882446, "learning_rate": 3.543003851091143e-05, "loss": 0.0185, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1288, "tokens_per_second_per_gpu": 2532.28 }, { "epoch": 8.2496, "grad_norm": 0.37007957696914673, "learning_rate": 3.453145057766367e-05, "loss": 0.0173, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1295, "tokens_per_second_per_gpu": 2554.02 }, { "epoch": 8.2944, "grad_norm": 0.39841657876968384, "learning_rate": 3.363286264441592e-05, "loss": 0.0209, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1302, "tokens_per_second_per_gpu": 2564.15 }, { "epoch": 8.3392, "grad_norm": 0.49784979224205017, "learning_rate": 3.273427471116817e-05, "loss": 0.0216, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1309, "tokens_per_second_per_gpu": 2565.58 }, { "epoch": 8.384, "grad_norm": 0.4872147738933563, "learning_rate": 3.183568677792042e-05, "loss": 0.0213, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1316, "tokens_per_second_per_gpu": 2521.34 }, { "epoch": 8.4288, "grad_norm": 0.40467020869255066, "learning_rate": 3.093709884467266e-05, "loss": 0.0211, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1323, "tokens_per_second_per_gpu": 2527.11 }, { "epoch": 8.4736, "grad_norm": 0.4135974943637848, "learning_rate": 3.0038510911424905e-05, "loss": 0.0216, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1330, "tokens_per_second_per_gpu": 2524.78 }, { "epoch": 8.5184, "grad_norm": 0.5452598333358765, "learning_rate": 2.9139922978177153e-05, "loss": 0.0221, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1337, "tokens_per_second_per_gpu": 2550.36 }, { "epoch": 8.5632, "grad_norm": 0.4054325819015503, "learning_rate": 2.8241335044929395e-05, "loss": 0.0241, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1344, "tokens_per_second_per_gpu": 2559.59 }, { "epoch": 8.608, "grad_norm": 0.4412683844566345, "learning_rate": 2.7342747111681643e-05, "loss": 0.0223, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1351, "tokens_per_second_per_gpu": 2566.21 }, { "epoch": 8.6528, "grad_norm": 0.4093833863735199, "learning_rate": 2.6444159178433892e-05, "loss": 0.0248, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1358, "tokens_per_second_per_gpu": 2555.89 }, { "epoch": 8.6976, "grad_norm": 0.3637797236442566, "learning_rate": 2.554557124518614e-05, "loss": 0.0195, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1365, "tokens_per_second_per_gpu": 2553.17 }, { "epoch": 8.7424, "grad_norm": 0.553248405456543, "learning_rate": 2.4646983311938385e-05, "loss": 0.0264, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1372, "tokens_per_second_per_gpu": 2551.19 }, { "epoch": 8.7872, "grad_norm": 0.4234694838523865, "learning_rate": 2.374839537869063e-05, "loss": 0.0234, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1379, "tokens_per_second_per_gpu": 2543.4 }, { "epoch": 8.832, "grad_norm": 0.517890453338623, "learning_rate": 2.284980744544288e-05, "loss": 0.0223, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1386, "tokens_per_second_per_gpu": 2541.45 }, { "epoch": 8.8768, "grad_norm": 0.3650319576263428, "learning_rate": 2.1951219512195124e-05, "loss": 0.0211, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1393, "tokens_per_second_per_gpu": 2552.94 }, { "epoch": 8.9216, "grad_norm": 0.40671035647392273, "learning_rate": 2.105263157894737e-05, "loss": 0.0234, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1400, "tokens_per_second_per_gpu": 2555.97 }, { "epoch": 8.9664, "grad_norm": 0.42694664001464844, "learning_rate": 2.0154043645699617e-05, "loss": 0.0228, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1407, "tokens_per_second_per_gpu": 2559.05 }, { "epoch": 9.0064, "grad_norm": 0.3225362300872803, "learning_rate": 1.9255455712451862e-05, "loss": 0.0203, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1414, "tokens_per_second_per_gpu": 2270.52 }, { "epoch": 9.0512, "grad_norm": 0.324491411447525, "learning_rate": 1.835686777920411e-05, "loss": 0.0163, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1421, "tokens_per_second_per_gpu": 2544.3 }, { "epoch": 9.096, "grad_norm": 0.4213584065437317, "learning_rate": 1.7458279845956356e-05, "loss": 0.0152, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1428, "tokens_per_second_per_gpu": 2543.63 }, { "epoch": 9.1408, "grad_norm": 0.3669317066669464, "learning_rate": 1.65596919127086e-05, "loss": 0.0144, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1435, "tokens_per_second_per_gpu": 2571.2 }, { "epoch": 9.1856, "grad_norm": 0.38088613748550415, "learning_rate": 1.5661103979460846e-05, "loss": 0.013, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1442, "tokens_per_second_per_gpu": 2523.16 }, { "epoch": 9.2304, "grad_norm": 0.3001616597175598, "learning_rate": 1.4762516046213096e-05, "loss": 0.0154, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1449, "tokens_per_second_per_gpu": 2543.85 }, { "epoch": 9.2752, "grad_norm": 0.40668153762817383, "learning_rate": 1.3863928112965341e-05, "loss": 0.0156, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1456, "tokens_per_second_per_gpu": 2545.97 }, { "epoch": 9.32, "grad_norm": 0.3763693869113922, "learning_rate": 1.2965340179717586e-05, "loss": 0.0146, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1463, "tokens_per_second_per_gpu": 2572.45 }, { "epoch": 9.3648, "grad_norm": 0.3395196795463562, "learning_rate": 1.2066752246469835e-05, "loss": 0.0169, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1470, "tokens_per_second_per_gpu": 2536.65 }, { "epoch": 9.4096, "grad_norm": 0.45923373103141785, "learning_rate": 1.116816431322208e-05, "loss": 0.0156, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1477, "tokens_per_second_per_gpu": 2545.81 }, { "epoch": 9.4544, "grad_norm": 0.42670783400535583, "learning_rate": 1.0269576379974327e-05, "loss": 0.015, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1484, "tokens_per_second_per_gpu": 2535.56 }, { "epoch": 9.4992, "grad_norm": 0.3387915790081024, "learning_rate": 9.370988446726572e-06, "loss": 0.0135, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1491, "tokens_per_second_per_gpu": 2534.57 }, { "epoch": 9.544, "grad_norm": 0.3647075593471527, "learning_rate": 8.472400513478818e-06, "loss": 0.014, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1498, "tokens_per_second_per_gpu": 2555.23 }, { "epoch": 9.588799999999999, "grad_norm": 0.3380165696144104, "learning_rate": 7.573812580231065e-06, "loss": 0.0145, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1505, "tokens_per_second_per_gpu": 2530.66 }, { "epoch": 9.6336, "grad_norm": 0.3743992745876312, "learning_rate": 6.675224646983312e-06, "loss": 0.0157, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1512, "tokens_per_second_per_gpu": 2524.24 }, { "epoch": 9.6784, "grad_norm": 0.33638349175453186, "learning_rate": 5.776636713735559e-06, "loss": 0.0154, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1519, "tokens_per_second_per_gpu": 2564.45 }, { "epoch": 9.7232, "grad_norm": 0.3695032596588135, "learning_rate": 4.8780487804878055e-06, "loss": 0.0153, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1526, "tokens_per_second_per_gpu": 2543.93 }, { "epoch": 9.768, "grad_norm": 0.3002121150493622, "learning_rate": 3.979460847240052e-06, "loss": 0.0155, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1533, "tokens_per_second_per_gpu": 2531.73 }, { "epoch": 9.8128, "grad_norm": 0.3316121995449066, "learning_rate": 3.080872913992298e-06, "loss": 0.0138, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1540, "tokens_per_second_per_gpu": 2552.69 }, { "epoch": 9.8576, "grad_norm": 0.46835947036743164, "learning_rate": 2.1822849807445445e-06, "loss": 0.0158, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1547, "tokens_per_second_per_gpu": 2561.07 }, { "epoch": 9.9024, "grad_norm": 0.4010881781578064, "learning_rate": 1.2836970474967908e-06, "loss": 0.0165, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1554, "tokens_per_second_per_gpu": 2561.87 }, { "epoch": 9.9472, "grad_norm": 0.4308512806892395, "learning_rate": 3.8510911424903727e-07, "loss": 0.0164, "memory/device_reserved (GiB)": 45.82, "memory/max_active (GiB)": 44.04, "memory/max_allocated (GiB)": 44.04, "step": 1561, "tokens_per_second_per_gpu": 2552.16 } ], "logging_steps": 7, "max_steps": 1563, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.135932040183808e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }